diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..d159169
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,339 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+                            NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..e6c51bc
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,50 @@
+SHARP_TARGET?=auto
+ifndef SHARP_TARGET
+  SHARP_TARGET:=$(error SHARP_TARGET undefined. Please see README.compilation for help)UNDEFINED
+endif
+
+default: compile_all
+SRCROOT:=$(shell pwd)
+include $(SRCROOT)/config/config.$(SHARP_TARGET)
+include $(SRCROOT)/config/rules.common
+
+all_hdr:=
+all_lib:=
+all_cbin:=
+
+FULL_INCLUDE:=
+
+include c_utils/planck.make
+include libfftpack/planck.make
+include libsharp/planck.make
+include docsrc/planck.make
+
+$(all_lib): %: | $(LIBDIR)_mkdir
+	@echo "#  creating library $*"
+	$(ARCREATE) $@ $^
+
+$(all_cbin): %: | $(BINDIR)_mkdir
+	@echo "#  linking C binary $*"
+	$(CL) -o $@ $^ $(CLFLAGS)
+#	$(CXX) -o $@ $^ $(CLFLAGS)
+
+compile_all: $(all_cbin) hdrcopy
+
+autotune: sharp_bench
+	$(BINDIR)/sharp_bench
+	mv oracle.inc $(SRCROOT)/libsharp
+	$(MAKE)
+
+hdrclean: # empty the header install directory; refuses to run when INCDIR is unset
+	@if [ -n "$(INCDIR)" ] && [ -d "$(INCDIR)" ]; then rm -rf "$(INCDIR)"/* ; fi
+
+hdrcopy: | $(INCDIR)_mkdir
+	@if [ "$(all_hdr)" ]; then cp -p $(all_hdr) $(INCDIR); fi
+
+$(notdir $(all_cbin)) : % : $(BINDIR)/%
+
+test: compile_all
+	$(BINDIR)/sharp_acctest && \
+	$(BINDIR)/sharp_test healpix 2048 1024 1 0 1 && \
+	$(BINDIR)/sharp_test ecp 2047 4096 0 2 1 && \
+	$(BINDIR)/sharp_test gauss 2047 4096 0 0 2
diff --git a/README.compilation b/README.compilation
new file mode 100644
index 0000000..7607750
--- /dev/null
+++ b/README.compilation
@@ -0,0 +1,16 @@
+GNU make and GNU gcc (version 4.x) are required for compilation.
+
+Simply run "./configure"; if this fails, please refer to the output of
+"./configure --help" for additional hints and, if necessary, provide
+additional flags to the configure script.
+Once the script finishes successfully, run "make autotune"
+(or "gmake autotune"). This should perform some necessary self-tuning and
+install the compilation products in the subdirectory "auto/".
+NOTE: Autotuning should be done on the computer where you wish to use
+the library later on, and no other CPU-intensive tasks should be running
+during the autotuning process.
+
+Documentation can be created by the command "(g)make doc".
+However this requires the doxygen application to be installed
+on your system.
+The documentation will be created in the subdirectory doc/.
diff --git a/c_utils/c_utils.c b/c_utils/c_utils.c
new file mode 100644
index 0000000..d8601e7
--- /dev/null
+++ b/c_utils/c_utils.c
@@ -0,0 +1,145 @@
+/*
+ *  This file is part of libc_utils.
+ *
+ *  libc_utils is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libc_utils is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libc_utils; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*
+ *  Convenience functions
+ *
+ *  Copyright (C) 2008, 2009, 2010, 2011 Max-Planck-Society
+ *  Author: Martin Reinecke
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "c_utils.h"
+#include "vec_utils.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+
+void util_warn_ (const char *file, int line, const char *func, const char *msg)
+  { /* write "<file>, <line> (<func>):" and, on the next line, msg to stderr */
+  fprintf(stderr,"%s, %i (%s):\n%s\n",file,line,func,msg);
+  }
+void util_fail_ (const char *file, int line, const char *func, const char *msg)
+  { /* same report as util_warn_, then terminate the process with status 1 */
+  util_warn_(file,line,func,msg);
+  exit(1);
+  }
+
+/* This function tries to avoid allocations with a total size close to a high
+   power of two (called the "critical stride" here), by adding a few more bytes
+   if necessary. This lowers the probability that two arrays differ by a multiple
+   of the critical stride in their starting address, which in turn lowers the
+   risk of cache line contention. */
+static size_t manipsize(size_t sz)
+  {
+  const size_t stride=4096, pad=2*64, extra=32; /* critical stride, two 64-byte lines, presumably allocator overhead */
+  const size_t tail=(sz+extra)%stride;          /* how far sz+extra lies past a stride multiple */
+  if ((sz>=stride/2) && (tail<=pad)) return sz+pad; /* large and near the stride: add padding */
+  return sz;                                    /* small or already far enough away: unchanged */
+  }
+
+#ifdef __SSE__
+#include <xmmintrin.h>  /* _mm_malloc, _mm_free */
+#endif
+void *util_malloc_ (size_t sz)  /* NULL for sz==0; otherwise never returns NULL */
+  {
+  void *mem;
+  if (sz==0) return NULL;
+#ifdef __SSE__
+  mem = _mm_malloc(manipsize(sz),16);  /* SSE build: 16-byte aligned block */
+  UTIL_ASSERT(mem,"_mm_malloc() failed");
+#else
+  mem = malloc(manipsize(sz));
+  UTIL_ASSERT(mem,"malloc() failed");
+#endif
+  return mem;
+  }
+void util_free_ (void *ptr)  /* releases a util_malloc_ block; NULL is ignored */
+  {
+#ifdef __SSE__
+  if (ptr) _mm_free(ptr);
+#else
+  if (ptr) free(ptr);
+#endif
+  }
+
+static void OpenMP_status(void)  /* print whether OpenMP is compiled in and its max. thread count */
+  {
+#ifdef _OPENMP
+  const int nthr = omp_get_max_threads();
+  if (nthr<=1)
+    printf("OpenMP active, but running with 1 thread only.\n");
+  else
+    printf("OpenMP active: max. %d threads.\n",nthr);
+#else
+  printf("OpenMP: not supported by this binary\n");
+#endif
+  }
+
+static void MPI_status(void)  /* print whether MPI is compiled in and the size of MPI_COMM_WORLD */
+  {
+#ifdef USE_MPI
+  int ntasks;
+  MPI_Comm_size(MPI_COMM_WORLD,&ntasks);
+  if (ntasks<=1)
+    printf("MPI active, but running with 1 task only.\n");
+  else
+    printf("MPI active with %d tasks.\n",ntasks);
+#else
+  printf("MPI: not supported by this binary\n");
+#endif
+  }
+
+static void vecmath_status(void)  /* print the compile-time constant VLEN */
+  { printf("Supported vector length: %d\n",VLEN); }
+
+void announce_c (const char *name)  /* print name in an ASCII box, then the three status lines */
+  {
+  size_t left, width=strlen(name);  /* the rules are as wide as the name */
+  printf("\n+-");                                  /* top rule */
+  for (left=width; left>0; --left) printf("-");
+  printf("-+\n");
+  printf("| %s |\n", name);                        /* boxed name */
+  printf("+-");                                    /* bottom rule */
+  for (left=width; left>0; --left) printf("-");
+  printf("-+\n\n");
+  vecmath_status();
+  OpenMP_status();
+  MPI_status();
+  printf("\n");
+  }
+
+void module_startup_c (const char *name, int argc, int argc_expected,
+  const char *argv_expected, int verbose)
+  {
+  const int args_ok = (argc==argc_expected);
+  if (verbose) announce_c (name);  /* banner is printed even when the argument count is wrong */
+  if (verbose && !args_ok) fprintf(stderr, "Usage: %s %s\n", name, argv_expected);
+  if (!args_ok) exit(1);           /* wrong argument count: quit; silent unless verbose */
+  }
diff --git a/c_utils/c_utils.h b/c_utils/c_utils.h
new file mode 100644
index 0000000..db6059c
--- /dev/null
+++ b/c_utils/c_utils.h
@@ -0,0 +1,151 @@
+/*
+ *  This file is part of libc_utils.
+ *
+ *  libc_utils is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libc_utils is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libc_utils; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file c_utils.h
+ *  Convenience functions
+ *
+ *  Copyright (C) 2008, 2009, 2010, 2011 Max-Planck-Society
+ *  \author Martin Reinecke
+ *  \note This file should only be included from .c files, NOT from .h files.
+ */
+
+#ifndef PLANCK_C_UTILS_H
+#define PLANCK_C_UTILS_H
+
+#include <math.h>
+#include <stdlib.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void util_fail_ (const char *file, int line, const char *func, const char *msg);
+void util_warn_ (const char *file, int line, const char *func, const char *msg);
+void *util_malloc_ (size_t sz);
+void util_free_ (void *ptr);
+
+void announce_c (const char *name);
+void module_startup_c (const char *name, int argc, int argc_expected,
+  const char *argv_expected, int verbose);
+
+#if defined (__GNUC__) || (defined (__STDC_VERSION__) && (__STDC_VERSION__>=199901L))
+#define UTIL_FUNC_NAME__ __func__   /* name of the enclosing function; standard since C99 */
+#else
+#define UTIL_FUNC_NAME__ "unknown"  /* neither GNU C nor C99: no function name available */
+#endif
+
+/*! \def UTIL_ASSERT(cond,msg)
+    If \a cond is false, print an error message containing function name,
+    source file name, line number and \a msg, then exit with an error status.
+    Expands to one do/while(0) statement, so it is safe before an \c else. */
+#define UTIL_ASSERT(cond,msg) \
+  do { if(!(cond)) util_fail_(__FILE__,__LINE__,UTIL_FUNC_NAME__,msg); } while(0)
+/*! \def UTIL_WARN(cond,msg)
+    If \a cond is false, print a warning containing function name, source
+    file name, line number and \a msg. One do/while(0) statement, else-safe. */
+#define UTIL_WARN(cond,msg) \
+  do { if(!(cond)) util_warn_(__FILE__,__LINE__,UTIL_FUNC_NAME__,msg); } while(0)
+/*! \def UTIL_FAIL(msg)
+    Print an error message containing function name,
+    source file name and line number of the call, as well as \a msg;
+    then exit the program with an error status. */
+#define UTIL_FAIL(msg) \
+  util_fail_(__FILE__,__LINE__,UTIL_FUNC_NAME__,msg)
+
+/*! \def ALLOC(ptr,type,num)
+    Allocate space for \a num objects of type \a type. Make sure that the
+    allocation succeeded, else stop the program with an error. Return the
+    resulting pointer in \a ptr. */
+#define ALLOC(ptr,type,num) \
+  do { (ptr)=(type *)util_malloc_((num)*sizeof(type)); } while (0)
+/*! \def RALLOC(type,num)
+    Allocate space for \a num objects of type \a type. Make sure that the
+    allocation succeeded, else stop the program with an error. Cast the
+    resulting pointer to \a (type*). */
+#define RALLOC(type,num) \
+  ((type *)util_malloc_((num)*sizeof(type)))
+/*! \def DEALLOC(ptr)
+    Deallocate \a ptr. It must have been allocated using \a ALLOC or
+    \a RALLOC. */
+#define DEALLOC(ptr) \
+  do { util_free_(ptr); (ptr)=NULL; } while(0)
+#define RESIZE(ptr,type,num) /* free ptr, then allocate num objects of type; old contents are NOT kept */ \
+  do { util_free_(ptr); ALLOC(ptr,type,num); } while(0)
+#define GROW(ptr,type,sz_old,sz_new) /* sz_old must be an lvalue; arguments are evaluated more than once */ \
+  do { \
+    if ((sz_new)>(sz_old)) /* too small: RESIZE to twice sz_new (contents lost) and record that in sz_old */ \
+      { RESIZE(ptr,type,2*(sz_new));sz_old=2*(sz_new); } \
+  } while(0)
+/*! \def SET_ARRAY(ptr,i1,i2,val)
+    Set the entries \a ptr[i1] ... \a ptr[i2-1] to \a val. */
+#define SET_ARRAY(ptr,i1,i2,val) \
+  do { \
+    ptrdiff_t cnt_; \
+    for (cnt_=(i1);cnt_<(i2);++cnt_) (ptr)[cnt_]=(val); \
+    } while(0)
+/*! \def COPY_ARRAY(src,dest,i1,i2)
+    Copy the entries \a src[i1] ... \a src[i2-1] to
+    \a dest[i1] ... \a dest[i2-1]. */
+#define COPY_ARRAY(src,dest,i1,i2) \
+  do { \
+    ptrdiff_t cnt_; \
+    for (cnt_=(i1);cnt_<(i2);++cnt_) (dest)[cnt_]=(src)[cnt_]; \
+    } while(0)
+
+#define ALLOC2D(ptr,type,num1,num2) /* num1 row pointers into one block of num1*num2 objects */ \
+  do { \
+    size_t cnt_, num1_=(num1), num2_=(num2); \
+    ALLOC(ptr,type *,num1_); \
+    if ((ptr)!=NULL) ALLOC((ptr)[0],type,num1_*num2_); /* ptr is NULL when num1 is 0 */ \
+    for (cnt_=1; cnt_<num1_; ++cnt_) \
+      (ptr)[cnt_]=(ptr)[cnt_-1]+num2_; /* each row starts num2 objects after the previous one */ \
+    } while(0)
+#define DEALLOC2D(ptr) /* release the data block and the row table of an ALLOC2D array; NULL is accepted */ \
+  do { if(ptr) DEALLOC((ptr)[0]); DEALLOC(ptr); } while(0)
+
+#define FAPPROX(a,b,eps) \
+  (fabs((a)-(b))<((eps)*fabs(b)))
+#define ABSAPPROX(a,b,eps) \
+  (fabs((a)-(b))<(eps))
+#define IMAX(a,b) \
+  (((a)>(b)) ? (a) : (b))
+#define IMIN(a,b) \
+  (((a)<(b)) ? (a) : (b))
+
+#define SWAP(a,b,type) \
+  do { type tmp_=(a); (a)=(b); (b)=tmp_; } while(0)
+
+#define CHECK_STACK_ALIGN(align) /* warn if a local double is not on an align-byte boundary */ \
+  do { \
+    double foo; \
+    UTIL_WARN((((size_t)(&foo))&((align)-1))==0, /* mask test: align must be a power of two */ \
+      "WARNING: stack not sufficiently aligned!"); \
+    } while(0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/c_utils/planck.make b/c_utils/planck.make
new file mode 100644
index 0000000..f4a2d30
--- /dev/null
+++ b/c_utils/planck.make
@@ -0,0 +1,18 @@
+PKG:=c_utils
+
+SD:=$(SRCROOT)/$(PKG)
+OD:=$(BLDROOT)/$(PKG)
+
+FULL_INCLUDE+= -I$(SD)
+
+HDR_$(PKG):=$(SD)/*.h
+LIB_$(PKG):=$(LIBDIR)/libc_utils.a
+
+OBJ:=c_utils.o walltime_c.o
+OBJ:=$(OBJ:%=$(OD)/%)
+
+$(OBJ): $(HDR_$(PKG)) | $(OD)_mkdir
+$(LIB_$(PKG)): $(OBJ)
+
+all_hdr+=$(HDR_$(PKG))
+all_lib+=$(LIB_$(PKG))
diff --git a/c_utils/vec_utils.h b/c_utils/vec_utils.h
new file mode 100644
index 0000000..50066f8
--- /dev/null
+++ b/c_utils/vec_utils.h
@@ -0,0 +1,43 @@
+/*
+ *  This file is part of libc_utils.
+ *
+ *  libc_utils is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libc_utils is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libc_utils; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file vec_utils.h
+ *  Functionality related to vector instruction support
+ *
+ *  Copyright (C) 2012 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#ifndef PLANCK_VEC_UTILS_H
+#define PLANCK_VEC_UTILS_H
+
+#if defined(__AVX__) && !defined(DISABLE_AVX) && !defined(DISABLE_SSE2)
+#define VLEN 4 /* AVX available and neither DISABLE_AVX nor DISABLE_SSE2 set */
+#elif defined(__SSE2__) && !defined(DISABLE_SSE2)
+#define VLEN 2 /* SSE2 available and DISABLE_SSE2 not set */
+#else
+#define VLEN 1 /* neither of the above applies */
+#endif
+
+#endif
diff --git a/c_utils/walltime_c.c b/c_utils/walltime_c.c
new file mode 100644
index 0000000..c9dce3a
--- /dev/null
+++ b/c_utils/walltime_c.c
@@ -0,0 +1,54 @@
+/*
+ *  This file is part of libc_utils.
+ *
+ *  libc_utils is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libc_utils is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libc_utils; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*
+ *  Functionality for reading wall clock time
+ *
+ *  Copyright (C) 2010, 2011 Max-Planck-Society
+ *  Author: Martin Reinecke
+ */
+
+#if defined (_OPENMP)
+#include <omp.h>
+#elif defined (USE_MPI)
+#include "mpi.h"
+#else
+#include <sys/time.h>
+#include <stdlib.h>
+#endif
+
+#include "walltime_c.h"
+
+double wallTime(void) /* seconds, read from the timer chosen at compile time */
+  {
+#if defined (_OPENMP)
+  return omp_get_wtime(); /* OpenMP builds */
+#elif defined (USE_MPI)
+  return MPI_Wtime(); /* USE_MPI builds without OpenMP */
+#else
+  struct timeval now; /* fallback: seconds plus microseconds */
+  gettimeofday(&now, NULL);
+  return (double)now.tv_sec + (double)now.tv_usec*1e-6;
+#endif
+  }
diff --git a/c_utils/walltime_c.h b/c_utils/walltime_c.h
new file mode 100644
index 0000000..ea9d2a2
--- /dev/null
+++ b/c_utils/walltime_c.h
@@ -0,0 +1,53 @@
+/*
+ *  This file is part of libc_utils.
+ *
+ *  libc_utils is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libc_utils is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libc_utils; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file walltime_c.h
+ *  Functionality for reading wall clock time
+ *
+ *  Copyright (C) 2010 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#ifndef PLANCK_WALLTIME_C_H
+#define PLANCK_WALLTIME_C_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! Returns an approximation of the current wall time (in seconds).
+    The timer is selected at compile time; the first available one is used:
+    <ul>
+    <li> \a omp_get_wtime(), if compiled with OpenMP (\c _OPENMP defined)
+    <li> \a MPI_Wtime(), if \c USE_MPI is defined
+    <li> \a gettimeofday() otherwise
+    </ul>
+    \note Only useful for measuring time differences. */
+double wallTime(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/config/config.auto.in b/config/config.auto.in
new file mode 100644
index 0000000..32b340b
--- /dev/null
+++ b/config/config.auto.in
@@ -0,0 +1,9 @@
+@SILENT_RULE@
+# Compiler and linker settings; values between at-signs are filled in by configure.
+CC=@CC@
+CL=@CC@
+CCFLAGS_NO_C=@CCFLAGS_NO_C@
+CCFLAGS=$(CCFLAGS_NO_C) -c
+CLFLAGS=-L. -L$(LIBDIR) @LDCCFLAGS@ -lm
+# Command prefix for creating static libraries, as chosen by configure.
+ARCREATE=@ARCREATE@
diff --git a/config/rules.common b/config/rules.common
new file mode 100644
index 0000000..419584d
--- /dev/null
+++ b/config/rules.common
@@ -0,0 +1,31 @@
+BLDROOT = $(SRCROOT)/build.$(SHARP_TARGET)
+PREFIX  = $(SRCROOT)/$(SHARP_TARGET)
+BINDIR	= $(PREFIX)/bin
+INCDIR	= $(PREFIX)/include
+LIBDIR	= $(PREFIX)/lib
+DOCDIR	= $(SRCROOT)/doc
+
+# do not use any suffix rules
+.SUFFIXES:
+# do not use any default rules
+.DEFAULT:
+# print the active configuration name; used below as an order-only prerequisite
+echo_config:
+	@echo using configuration \'$(SHARP_TARGET)\'
+# compile a C source from inside the target directory, so the object lands there
+$(BLDROOT)/%.o : $(SRCROOT)/%.c | echo_config
+	@echo "#  compiling $*.c"
+	cd $(@D) && $(CC) $(FULL_INCLUDE) -I$(BLDROOT) $(CCFLAGS) $<
+# same for C++; NOTE(review): config/config.auto.in sets neither CXX nor CXXCFLAGS
+$(BLDROOT)/%.o : $(SRCROOT)/%.cc | echo_config
+	@echo "#  compiling $*.cc"
+	cd $(@D) && $(CXX) $(FULL_INCLUDE) -I$(BLDROOT) $(CXXCFLAGS) $<
+# create the directory named by the target stem if it does not exist yet
+%_mkdir:
+	@if [ ! -d $* ]; then mkdir -p $* ; fi
+# NOTE(review): PREFIX becomes SRCROOT/ if SHARP_TARGET is empty; confirm it is always set
+clean:
+	rm -rf $(BLDROOT) $(PREFIX) $(DOCDIR) autom4te.cache/ config.log config.status
+# in addition to clean, remove the generated configuration file
+distclean: clean
+	rm -f config/config.auto
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 0000000..cb3ddc6
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,123 @@
+AC_INIT(config/config.auto.in)
+# Determine the system type as name-release from uname, or unknown without it.
+AC_CHECK_PROG([uname_found],[uname],[1],[0])
+if test $uname_found -eq 0 ; then
+    echo "No uname found; setting system type to unknown."
+    system="unknown"
+else
+    system=`uname -s`-`uname -r`
+fi
+AC_LANG([C])
+# Intel compiler probe: the test program compiles only if __INTEL_COMPILER is defined.
+AC_TRY_COMPILE([], [@%:@ifndef __INTEL_COMPILER
+choke me
+@%:@endif], [ICC=[yes]], [ICC=[no]])
+# Classify the compiler; GCC is quoted because it is empty for non-GNU compilers.
+if test $ICC = yes; then GCC=no; fi
+CCTYPE=unknown
+if test "$GCC" = yes; then CCTYPE=gcc; fi
+if test $ICC = yes; then CCTYPE=icc; fi
+AC_OPENMP
+
+SILENT_RULE=".SILENT:"  # make output is silenced unless --enable-noisy-make is given
+AC_ARG_ENABLE(noisy-make,
+  [  --enable-noisy-make     enable detailed make output],
+  [if test "$enableval" = yes; then
+     SILENT_RULE=""
+   fi])
+# MPI support stays off unless --enable-mpi is given.
+ENABLE_MPI=no
+AC_ARG_ENABLE(mpi,
+  [  --enable-mpi            enable generation of MPI-parallel code],
+  [if test "$enableval" = yes; then
+     ENABLE_MPI=yes
+   fi])
+# Debugging symbols stay off unless --enable-debug is given.
+ENABLE_DEBUG=no
+AC_ARG_ENABLE(debug,
+  [  --enable-debug          enable generation of debugging symbols],
+  [if test "$enableval" = yes; then
+     ENABLE_DEBUG=yes
+   fi])
+# SSE2 code generation stays on unless --disable-sse2 is given.
+ENABLE_SSE2=yes
+AC_ARG_ENABLE(sse2,
+  [  --disable-sse2          disable generation of SSE2 instructions],
+  [if test "$enableval" = no; then
+     ENABLE_SSE2=no
+   fi])
+# AVX code generation stays on unless --disable-avx is given.
+ENABLE_AVX=yes
+AC_ARG_ENABLE(avx,
+  [  --disable-avx           disable generation of AVX instructions],
+  [if test "$enableval" = no; then
+     ENABLE_AVX=no
+   fi])
+
+case $CCTYPE in
+  gcc)
+    CCFLAGS="-O3 -fno-tree-vectorize -ffast-math -fomit-frame-pointer -std=c99 -pedantic -Wextra -Wall -Wno-unknown-pragmas -Wshadow -Wmissing-prototypes -Wfatal-errors"
+    GCCVERSION="`$CC -dumpversion 2>&1`"
+    echo "Using gcc version $GCCVERSION"
+    AC_SUBST(GCCVERSION)
+    case $system in
+      Darwin-*)
+        ;;
+      *)
+        CCFLAGS="$CCFLAGS -ffunction-sections -fdata-sections"
+        ;;
+    esac
+    changequote(,)
+    gcc43=`echo $GCCVERSION | grep -c -E '^(4\.[3-9]|[5-9]|[1-9][0-9])'`
+    changequote([,])
+    if test $gcc43 -gt 0; then  # version starts with 4.3-4.9 or a major number >= 5
+      CCFLAGS="$CCFLAGS -march=native"
+    fi
+    ;;
+  icc)
+    CCFLAGS="-O3 -xHOST -std=c99 -ip -Wbrief -Wall -vec-report0 -openmp-report0 -wd383,981,1419,1572"
+    ;;
+  *)
+    CCFLAGS="-O2"
+    # unrecognised compiler: use generic optimisation only
+    ;;
+esac
+
+case $system in  # choose the command that creates static libraries
+  Darwin-*)
+    ARCREATE="libtool -static -o"
+    ;;
+  *)
+    ARCREATE="ar cr"
+    ;;
+esac
+# Append the OpenMP compiler flags held in OPENMP_CFLAGS.
+CCFLAGS="$CCFLAGS $OPENMP_CFLAGS"
+# Translate the option switches into compiler flags and preprocessor defines.
+if test $ENABLE_DEBUG = yes; then
+  CCFLAGS="$CCFLAGS -g"
+fi
+
+if test $ENABLE_MPI = yes; then
+  CCFLAGS="$CCFLAGS -DUSE_MPI"
+fi
+
+if test $ENABLE_SSE2 = no; then
+  CCFLAGS="$CCFLAGS -DDISABLE_SSE2"
+fi
+
+if test $ENABLE_AVX = no; then
+  CCFLAGS="$CCFLAGS -DDISABLE_AVX"
+fi
+# Compile flags without -c; config/config.auto.in appends -c to form CCFLAGS.
+CCFLAGS_NO_C="$CCFLAGS $CPPFLAGS"
+# The link flags are LDFLAGS followed by the compile flags.
+LDCCFLAGS="$LDFLAGS $CCFLAGS"
+# Export the values referenced by config/config.auto.in.
+AC_SUBST(SILENT_RULE)
+AC_SUBST(CC)
+AC_SUBST(CCFLAGS_NO_C)
+AC_SUBST(LDCCFLAGS)
+AC_SUBST(ARCREATE)
+# Generate config/config.auto from its template.
+AC_OUTPUT(config/config.auto)
diff --git a/docsrc/c_utils.dox b/docsrc/c_utils.dox
new file mode 100644
index 0000000..daf432f
--- /dev/null
+++ b/docsrc/c_utils.dox
@@ -0,0 +1,290 @@
+# Doxyfile 1.8.1
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+DOXYFILE_ENCODING      = UTF-8
+PROJECT_NAME           = "LevelS C support library"
+PROJECT_NUMBER         = 0.1
+PROJECT_BRIEF          =
+PROJECT_LOGO           =
+OUTPUT_DIRECTORY       = .
+CREATE_SUBDIRS         = NO
+OUTPUT_LANGUAGE        = English
+BRIEF_MEMBER_DESC      = NO
+REPEAT_BRIEF           = YES
+ABBREVIATE_BRIEF       =
+ALWAYS_DETAILED_SEC    = NO
+INLINE_INHERITED_MEMB  = NO
+FULL_PATH_NAMES        = NO
+STRIP_FROM_PATH        =
+STRIP_FROM_INC_PATH    =
+SHORT_NAMES            = NO
+JAVADOC_AUTOBRIEF      = NO
+QT_AUTOBRIEF           = NO
+MULTILINE_CPP_IS_BRIEF = NO
+INHERIT_DOCS           = YES
+SEPARATE_MEMBER_PAGES  = NO
+TAB_SIZE               = 8
+ALIASES                =
+TCL_SUBST              =
+OPTIMIZE_OUTPUT_FOR_C  = YES
+OPTIMIZE_OUTPUT_JAVA   = NO
+OPTIMIZE_FOR_FORTRAN   = NO
+OPTIMIZE_OUTPUT_VHDL   = NO
+EXTENSION_MAPPING      =
+MARKDOWN_SUPPORT       = YES
+BUILTIN_STL_SUPPORT    = NO
+CPP_CLI_SUPPORT        = NO
+SIP_SUPPORT            = NO
+IDL_PROPERTY_SUPPORT   = YES
+DISTRIBUTE_GROUP_DOC   = NO
+SUBGROUPING            = YES
+INLINE_GROUPED_CLASSES = NO
+INLINE_SIMPLE_STRUCTS  = NO
+TYPEDEF_HIDES_STRUCT   = NO
+SYMBOL_CACHE_SIZE      = 0
+LOOKUP_CACHE_SIZE      = 0
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+EXTRACT_ALL            = NO
+EXTRACT_PRIVATE        = NO
+EXTRACT_PACKAGE        = NO
+EXTRACT_STATIC         = NO
+EXTRACT_LOCAL_CLASSES  = YES
+EXTRACT_LOCAL_METHODS  = NO
+EXTRACT_ANON_NSPACES   = NO
+HIDE_UNDOC_MEMBERS     = YES
+HIDE_UNDOC_CLASSES     = YES
+HIDE_FRIEND_COMPOUNDS  = YES
+HIDE_IN_BODY_DOCS      = NO
+INTERNAL_DOCS          = NO
+CASE_SENSE_NAMES       = YES
+HIDE_SCOPE_NAMES       = NO
+SHOW_INCLUDE_FILES     = YES
+FORCE_LOCAL_INCLUDES   = NO
+INLINE_INFO            = YES
+SORT_MEMBER_DOCS       = NO
+SORT_BRIEF_DOCS        = NO
+SORT_MEMBERS_CTORS_1ST = NO
+SORT_GROUP_NAMES       = NO
+SORT_BY_SCOPE_NAME     = NO
+STRICT_PROTO_MATCHING  = NO
+GENERATE_TODOLIST      = YES
+GENERATE_TESTLIST      = YES
+GENERATE_BUGLIST       = YES
+GENERATE_DEPRECATEDLIST= YES
+ENABLED_SECTIONS       =
+MAX_INITIALIZER_LINES  = 30
+SHOW_USED_FILES        = YES
+SHOW_FILES             = YES
+SHOW_NAMESPACES        = YES
+FILE_VERSION_FILTER    =
+LAYOUT_FILE            =
+CITE_BIB_FILES         =
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+QUIET                  = YES
+WARNINGS               = YES
+WARN_IF_UNDOCUMENTED   = YES
+WARN_IF_DOC_ERROR      = YES
+WARN_NO_PARAMDOC       = NO
+WARN_FORMAT            = "$file:$line: $text"
+WARN_LOGFILE           =
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+INPUT                  = ../c_utils
+INPUT_ENCODING         = UTF-8
+FILE_PATTERNS          = *.h \
+                         *.c \
+                         *.dox
+RECURSIVE              = YES
+EXCLUDE                =
+EXCLUDE_SYMLINKS       = NO
+EXCLUDE_PATTERNS       =
+EXCLUDE_SYMBOLS        =
+EXAMPLE_PATH           =
+EXAMPLE_PATTERNS       =
+EXAMPLE_RECURSIVE      = NO
+IMAGE_PATH             =
+INPUT_FILTER           =
+FILTER_PATTERNS        =
+FILTER_SOURCE_FILES    = NO
+FILTER_SOURCE_PATTERNS =
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+SOURCE_BROWSER         = YES
+INLINE_SOURCES         = NO
+STRIP_CODE_COMMENTS    = NO
+REFERENCED_BY_RELATION = NO
+REFERENCES_RELATION    = NO
+REFERENCES_LINK_SOURCE = YES
+USE_HTAGS              = NO
+VERBATIM_HEADERS       = YES
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+ALPHABETICAL_INDEX     = YES
+COLS_IN_ALPHA_INDEX    = 5
+IGNORE_PREFIX          =
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+GENERATE_HTML          = YES
+HTML_OUTPUT            = htmldoc
+HTML_FILE_EXTENSION    = .html
+HTML_HEADER            =
+HTML_FOOTER            = footer.html
+HTML_STYLESHEET        =
+HTML_EXTRA_FILES       =
+HTML_COLORSTYLE_HUE    = 220
+HTML_COLORSTYLE_SAT    = 100
+HTML_COLORSTYLE_GAMMA  = 80
+HTML_TIMESTAMP         = YES
+HTML_DYNAMIC_SECTIONS  = NO
+HTML_INDEX_NUM_ENTRIES = 100
+GENERATE_DOCSET        = NO
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+DOCSET_PUBLISHER_NAME  = Publisher
+GENERATE_HTMLHELP      = NO
+CHM_FILE               =
+HHC_LOCATION           =
+GENERATE_CHI           = NO
+CHM_INDEX_ENCODING     =
+BINARY_TOC             = NO
+TOC_EXPAND             = NO
+GENERATE_QHP           = NO
+QCH_FILE               =
+QHP_NAMESPACE          = org.doxygen.Project
+QHP_VIRTUAL_FOLDER     = doc
+QHP_CUST_FILTER_NAME   =
+QHP_CUST_FILTER_ATTRS  =
+QHP_SECT_FILTER_ATTRS  =
+QHG_LOCATION           =
+GENERATE_ECLIPSEHELP   = NO
+ECLIPSE_DOC_ID         = org.doxygen.Project
+DISABLE_INDEX          = NO
+GENERATE_TREEVIEW      = NO
+ENUM_VALUES_PER_LINE   = 4
+TREEVIEW_WIDTH         = 250
+EXT_LINKS_IN_WINDOW    = NO
+FORMULA_FONTSIZE       = 10
+FORMULA_TRANSPARENT    = YES
+USE_MATHJAX            = NO
+MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
+MATHJAX_EXTENSIONS     =
+SEARCHENGINE           = NO
+SERVER_BASED_SEARCH    = NO
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+GENERATE_LATEX         = NO
+LATEX_OUTPUT           = latex
+LATEX_CMD_NAME         = latex
+MAKEINDEX_CMD_NAME     = makeindex
+COMPACT_LATEX          = YES
+PAPER_TYPE             = a4wide
+EXTRA_PACKAGES         =
+LATEX_HEADER           =
+LATEX_FOOTER           =
+PDF_HYPERLINKS         = YES
+USE_PDFLATEX           = YES
+LATEX_BATCHMODE        = NO
+LATEX_HIDE_INDICES     = NO
+LATEX_SOURCE_CODE      = NO
+LATEX_BIB_STYLE        = plain
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+GENERATE_RTF           = NO
+RTF_OUTPUT             = rtf
+COMPACT_RTF            = NO
+RTF_HYPERLINKS         = NO
+RTF_STYLESHEET_FILE    =
+RTF_EXTENSIONS_FILE    =
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+GENERATE_MAN           = NO
+MAN_OUTPUT             = man
+MAN_EXTENSION          = .3
+MAN_LINKS              = NO
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+GENERATE_XML           = NO
+XML_OUTPUT             = xml
+XML_SCHEMA             =
+XML_DTD                =
+XML_PROGRAMLISTING     = YES
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+GENERATE_AUTOGEN_DEF   = NO
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+GENERATE_PERLMOD       = NO
+PERLMOD_LATEX          = NO
+PERLMOD_PRETTY         = YES
+PERLMOD_MAKEVAR_PREFIX =
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+ENABLE_PREPROCESSING   = YES
+MACRO_EXPANSION        = NO
+EXPAND_ONLY_PREDEF     = NO
+SEARCH_INCLUDES        = YES
+INCLUDE_PATH           =
+INCLUDE_FILE_PATTERNS  =
+PREDEFINED             =
+EXPAND_AS_DEFINED      =
+SKIP_FUNCTION_MACROS   = YES
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+TAGFILES               =
+GENERATE_TAGFILE       = c_utils.tag
+ALLEXTERNALS           = NO
+EXTERNAL_GROUPS        = YES
+PERL_PATH              = /usr/bin/perl
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+CLASS_DIAGRAMS         = YES
+MSCGEN_PATH            =
+HIDE_UNDOC_RELATIONS   = YES
+HAVE_DOT               = NO
+DOT_NUM_THREADS        = 0
+DOT_FONTNAME           = FreeSans
+DOT_FONTSIZE           = 10
+DOT_FONTPATH           =
+CLASS_GRAPH            = YES
+COLLABORATION_GRAPH    = YES
+GROUP_GRAPHS           = YES
+UML_LOOK               = NO
+UML_LIMIT_NUM_FIELDS   = 10
+TEMPLATE_RELATIONS     = YES
+INCLUDE_GRAPH          = NO
+INCLUDED_BY_GRAPH      = NO
+CALL_GRAPH             = NO
+CALLER_GRAPH           = NO
+GRAPHICAL_HIERARCHY    = NO
+DIRECTORY_GRAPH        = YES
+DOT_IMAGE_FORMAT       = png
+INTERACTIVE_SVG        = NO
+DOT_PATH               =
+DOTFILE_DIRS           =
+MSCFILE_DIRS           =
+DOT_GRAPH_MAX_NODES    = 50
+MAX_DOT_GRAPH_DEPTH    = 0
+DOT_TRANSPARENT        = NO
+DOT_MULTI_TARGETS      = NO
+GENERATE_LEGEND        = YES
+DOT_CLEANUP            = YES
diff --git a/docsrc/footer.html b/docsrc/footer.html
new file mode 100644
index 0000000..6f5dbf0
--- /dev/null
+++ b/docsrc/footer.html
@@ -0,0 +1,5 @@
+<hr><address style="text-align: right;"><small>
+Generated on $datetime for $projectname
+</small></address>
+</body>
+</html>
diff --git a/docsrc/index_code.html b/docsrc/index_code.html
new file mode 100644
index 0000000..d8a001d
--- /dev/null
+++ b/docsrc/index_code.html
@@ -0,0 +1,15 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
+<html><head><meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1">
+<title>Libsharp source code documentation</title>
+</head><body>
+<H1>Libsharp source code documentation</H1>
+
+<H2>C interfaces</H2>
+
+<ul>
+<li><a href="c_utils/index.html">C support library</a>
+<li><a href="libfftpack/index.html">FFT interface</a>
+<li><a href="libsharp/index.html">Library for spherical harmonic transforms</a>
+</ul>
+</body>
+</html>
diff --git a/docsrc/libfftpack.dox b/docsrc/libfftpack.dox
new file mode 100644
index 0000000..7ff2c23
--- /dev/null
+++ b/docsrc/libfftpack.dox
@@ -0,0 +1,290 @@
+# Doxyfile 1.8.1
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+DOXYFILE_ENCODING      = UTF-8
+PROJECT_NAME           = "LevelS FFT library"
+PROJECT_NUMBER         = 0.1
+PROJECT_BRIEF          =
+PROJECT_LOGO           =
+OUTPUT_DIRECTORY       = .
+CREATE_SUBDIRS         = NO
+OUTPUT_LANGUAGE        = English
+BRIEF_MEMBER_DESC      = NO
+REPEAT_BRIEF           = YES
+ABBREVIATE_BRIEF       =
+ALWAYS_DETAILED_SEC    = NO
+INLINE_INHERITED_MEMB  = NO
+FULL_PATH_NAMES        = NO
+STRIP_FROM_PATH        =
+STRIP_FROM_INC_PATH    =
+SHORT_NAMES            = NO
+JAVADOC_AUTOBRIEF      = NO
+QT_AUTOBRIEF           = NO
+MULTILINE_CPP_IS_BRIEF = NO
+INHERIT_DOCS           = YES
+SEPARATE_MEMBER_PAGES  = NO
+TAB_SIZE               = 8
+ALIASES                =
+TCL_SUBST              =
+OPTIMIZE_OUTPUT_FOR_C  = YES
+OPTIMIZE_OUTPUT_JAVA   = NO
+OPTIMIZE_FOR_FORTRAN   = NO
+OPTIMIZE_OUTPUT_VHDL   = NO
+EXTENSION_MAPPING      =
+MARKDOWN_SUPPORT       = YES
+BUILTIN_STL_SUPPORT    = NO
+CPP_CLI_SUPPORT        = NO
+SIP_SUPPORT            = NO
+IDL_PROPERTY_SUPPORT   = YES
+DISTRIBUTE_GROUP_DOC   = NO
+SUBGROUPING            = YES
+INLINE_GROUPED_CLASSES = NO
+INLINE_SIMPLE_STRUCTS  = NO
+TYPEDEF_HIDES_STRUCT   = NO
+SYMBOL_CACHE_SIZE      = 0
+LOOKUP_CACHE_SIZE      = 0
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+EXTRACT_ALL            = NO
+EXTRACT_PRIVATE        = NO
+EXTRACT_PACKAGE        = NO
+EXTRACT_STATIC         = NO
+EXTRACT_LOCAL_CLASSES  = YES
+EXTRACT_LOCAL_METHODS  = NO
+EXTRACT_ANON_NSPACES   = NO
+HIDE_UNDOC_MEMBERS     = YES
+HIDE_UNDOC_CLASSES     = YES
+HIDE_FRIEND_COMPOUNDS  = YES
+HIDE_IN_BODY_DOCS      = NO
+INTERNAL_DOCS          = NO
+CASE_SENSE_NAMES       = YES
+HIDE_SCOPE_NAMES       = NO
+SHOW_INCLUDE_FILES     = YES
+FORCE_LOCAL_INCLUDES   = NO
+INLINE_INFO            = YES
+SORT_MEMBER_DOCS       = NO
+SORT_BRIEF_DOCS        = NO
+SORT_MEMBERS_CTORS_1ST = NO
+SORT_GROUP_NAMES       = NO
+SORT_BY_SCOPE_NAME     = NO
+STRICT_PROTO_MATCHING  = NO
+GENERATE_TODOLIST      = YES
+GENERATE_TESTLIST      = YES
+GENERATE_BUGLIST       = YES
+GENERATE_DEPRECATEDLIST= YES
+ENABLED_SECTIONS       =
+MAX_INITIALIZER_LINES  = 30
+SHOW_USED_FILES        = YES
+SHOW_FILES             = YES
+SHOW_NAMESPACES        = YES
+FILE_VERSION_FILTER    =
+LAYOUT_FILE            =
+CITE_BIB_FILES         =
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+QUIET                  = YES
+WARNINGS               = YES
+WARN_IF_UNDOCUMENTED   = YES
+WARN_IF_DOC_ERROR      = YES
+WARN_NO_PARAMDOC       = NO
+WARN_FORMAT            = "$file:$line: $text"
+WARN_LOGFILE           =
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+INPUT                  = ../libfftpack
+INPUT_ENCODING         = UTF-8
+FILE_PATTERNS          = *.h \
+                         *.c \
+                         *.dox
+RECURSIVE              = YES
+EXCLUDE                =
+EXCLUDE_SYMLINKS       = NO
+EXCLUDE_PATTERNS       =
+EXCLUDE_SYMBOLS        =
+EXAMPLE_PATH           =
+EXAMPLE_PATTERNS       =
+EXAMPLE_RECURSIVE      = NO
+IMAGE_PATH             =
+INPUT_FILTER           =
+FILTER_PATTERNS        =
+FILTER_SOURCE_FILES    = NO
+FILTER_SOURCE_PATTERNS =
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+SOURCE_BROWSER         = YES
+INLINE_SOURCES         = NO
+STRIP_CODE_COMMENTS    = NO
+REFERENCED_BY_RELATION = NO
+REFERENCES_RELATION    = NO
+REFERENCES_LINK_SOURCE = YES
+USE_HTAGS              = NO
+VERBATIM_HEADERS       = YES
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+ALPHABETICAL_INDEX     = YES
+COLS_IN_ALPHA_INDEX    = 5
+IGNORE_PREFIX          =
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+GENERATE_HTML          = YES
+HTML_OUTPUT            = htmldoc
+HTML_FILE_EXTENSION    = .html
+HTML_HEADER            =
+HTML_FOOTER            = footer.html
+HTML_STYLESHEET        =
+HTML_EXTRA_FILES       =
+HTML_COLORSTYLE_HUE    = 220
+HTML_COLORSTYLE_SAT    = 100
+HTML_COLORSTYLE_GAMMA  = 80
+HTML_TIMESTAMP         = YES
+HTML_DYNAMIC_SECTIONS  = NO
+HTML_INDEX_NUM_ENTRIES = 100
+GENERATE_DOCSET        = NO
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+DOCSET_PUBLISHER_NAME  = Publisher
+GENERATE_HTMLHELP      = NO
+CHM_FILE               =
+HHC_LOCATION           =
+GENERATE_CHI           = NO
+CHM_INDEX_ENCODING     =
+BINARY_TOC             = NO
+TOC_EXPAND             = NO
+GENERATE_QHP           = NO
+QCH_FILE               =
+QHP_NAMESPACE          = org.doxygen.Project
+QHP_VIRTUAL_FOLDER     = doc
+QHP_CUST_FILTER_NAME   =
+QHP_CUST_FILTER_ATTRS  =
+QHP_SECT_FILTER_ATTRS  =
+QHG_LOCATION           =
+GENERATE_ECLIPSEHELP   = NO
+ECLIPSE_DOC_ID         = org.doxygen.Project
+DISABLE_INDEX          = NO
+GENERATE_TREEVIEW      = NO
+ENUM_VALUES_PER_LINE   = 4
+TREEVIEW_WIDTH         = 250
+EXT_LINKS_IN_WINDOW    = NO
+FORMULA_FONTSIZE       = 10
+FORMULA_TRANSPARENT    = YES
+USE_MATHJAX            = NO
+MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
+MATHJAX_EXTENSIONS     =
+SEARCHENGINE           = NO
+SERVER_BASED_SEARCH    = NO
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+GENERATE_LATEX         = NO
+LATEX_OUTPUT           = latex
+LATEX_CMD_NAME         = latex
+MAKEINDEX_CMD_NAME     = makeindex
+COMPACT_LATEX          = YES
+PAPER_TYPE             = a4wide
+EXTRA_PACKAGES         =
+LATEX_HEADER           =
+LATEX_FOOTER           =
+PDF_HYPERLINKS         = YES
+USE_PDFLATEX           = YES
+LATEX_BATCHMODE        = NO
+LATEX_HIDE_INDICES     = NO
+LATEX_SOURCE_CODE      = NO
+LATEX_BIB_STYLE        = plain
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+GENERATE_RTF           = NO
+RTF_OUTPUT             = rtf
+COMPACT_RTF            = NO
+RTF_HYPERLINKS         = NO
+RTF_STYLESHEET_FILE    =
+RTF_EXTENSIONS_FILE    =
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+GENERATE_MAN           = NO
+MAN_OUTPUT             = man
+MAN_EXTENSION          = .3
+MAN_LINKS              = NO
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+GENERATE_XML           = NO
+XML_OUTPUT             = xml
+XML_SCHEMA             =
+XML_DTD                =
+XML_PROGRAMLISTING     = YES
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+GENERATE_AUTOGEN_DEF   = NO
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+GENERATE_PERLMOD       = NO
+PERLMOD_LATEX          = NO
+PERLMOD_PRETTY         = YES
+PERLMOD_MAKEVAR_PREFIX =
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+ENABLE_PREPROCESSING   = YES
+MACRO_EXPANSION        = NO
+EXPAND_ONLY_PREDEF     = NO
+SEARCH_INCLUDES        = YES
+INCLUDE_PATH           =
+INCLUDE_FILE_PATTERNS  =
+PREDEFINED             =
+EXPAND_AS_DEFINED      =
+SKIP_FUNCTION_MACROS   = YES
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+TAGFILES               = c_utils.tag=../c_utils
+GENERATE_TAGFILE       = libfftpack.tag
+ALLEXTERNALS           = NO
+EXTERNAL_GROUPS        = YES
+PERL_PATH              = /usr/bin/perl
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+CLASS_DIAGRAMS         = YES
+MSCGEN_PATH            =
+HIDE_UNDOC_RELATIONS   = YES
+HAVE_DOT               = NO
+DOT_NUM_THREADS        = 0
+DOT_FONTNAME           = FreeSans
+DOT_FONTSIZE           = 10
+DOT_FONTPATH           =
+CLASS_GRAPH            = YES
+COLLABORATION_GRAPH    = YES
+GROUP_GRAPHS           = YES
+UML_LOOK               = NO
+UML_LIMIT_NUM_FIELDS   = 10
+TEMPLATE_RELATIONS     = YES
+INCLUDE_GRAPH          = NO
+INCLUDED_BY_GRAPH      = NO
+CALL_GRAPH             = NO
+CALLER_GRAPH           = NO
+GRAPHICAL_HIERARCHY    = NO
+DIRECTORY_GRAPH        = YES
+DOT_IMAGE_FORMAT       = png
+INTERACTIVE_SVG        = NO
+DOT_PATH               =
+DOTFILE_DIRS           =
+MSCFILE_DIRS           =
+DOT_GRAPH_MAX_NODES    = 50
+MAX_DOT_GRAPH_DEPTH    = 0
+DOT_TRANSPARENT        = NO
+DOT_MULTI_TARGETS      = NO
+GENERATE_LEGEND        = YES
+DOT_CLEANUP            = YES
diff --git a/docsrc/libsharp.dox b/docsrc/libsharp.dox
new file mode 100644
index 0000000..b476ab4
--- /dev/null
+++ b/docsrc/libsharp.dox
@@ -0,0 +1,291 @@
+# Doxyfile 1.8.1
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+DOXYFILE_ENCODING      = UTF-8
+PROJECT_NAME           = "LevelS SHT library"
+PROJECT_NUMBER         = 0.1
+PROJECT_BRIEF          =
+PROJECT_LOGO           =
+OUTPUT_DIRECTORY       = .
+CREATE_SUBDIRS         = NO
+OUTPUT_LANGUAGE        = English
+BRIEF_MEMBER_DESC      = NO
+REPEAT_BRIEF           = YES
+ABBREVIATE_BRIEF       =
+ALWAYS_DETAILED_SEC    = NO
+INLINE_INHERITED_MEMB  = NO
+FULL_PATH_NAMES        = NO
+STRIP_FROM_PATH        =
+STRIP_FROM_INC_PATH    =
+SHORT_NAMES            = NO
+JAVADOC_AUTOBRIEF      = NO
+QT_AUTOBRIEF           = NO
+MULTILINE_CPP_IS_BRIEF = NO
+INHERIT_DOCS           = YES
+SEPARATE_MEMBER_PAGES  = NO
+TAB_SIZE               = 8
+ALIASES                =
+TCL_SUBST              =
+OPTIMIZE_OUTPUT_FOR_C  = YES
+OPTIMIZE_OUTPUT_JAVA   = NO
+OPTIMIZE_FOR_FORTRAN   = NO
+OPTIMIZE_OUTPUT_VHDL   = NO
+EXTENSION_MAPPING      =
+MARKDOWN_SUPPORT       = YES
+BUILTIN_STL_SUPPORT    = NO
+CPP_CLI_SUPPORT        = NO
+SIP_SUPPORT            = NO
+IDL_PROPERTY_SUPPORT   = YES
+DISTRIBUTE_GROUP_DOC   = NO
+SUBGROUPING            = YES
+INLINE_GROUPED_CLASSES = NO
+INLINE_SIMPLE_STRUCTS  = NO
+TYPEDEF_HIDES_STRUCT   = NO
+SYMBOL_CACHE_SIZE      = 0
+LOOKUP_CACHE_SIZE      = 0
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+EXTRACT_ALL            = NO
+EXTRACT_PRIVATE        = NO
+EXTRACT_PACKAGE        = NO
+EXTRACT_STATIC         = NO
+EXTRACT_LOCAL_CLASSES  = YES
+EXTRACT_LOCAL_METHODS  = NO
+EXTRACT_ANON_NSPACES   = NO
+HIDE_UNDOC_MEMBERS     = YES
+HIDE_UNDOC_CLASSES     = YES
+HIDE_FRIEND_COMPOUNDS  = YES
+HIDE_IN_BODY_DOCS      = NO
+INTERNAL_DOCS          = NO
+CASE_SENSE_NAMES       = YES
+HIDE_SCOPE_NAMES       = NO
+SHOW_INCLUDE_FILES     = YES
+FORCE_LOCAL_INCLUDES   = NO
+INLINE_INFO            = YES
+SORT_MEMBER_DOCS       = NO
+SORT_BRIEF_DOCS        = NO
+SORT_MEMBERS_CTORS_1ST = NO
+SORT_GROUP_NAMES       = NO
+SORT_BY_SCOPE_NAME     = NO
+STRICT_PROTO_MATCHING  = NO
+GENERATE_TODOLIST      = YES
+GENERATE_TESTLIST      = YES
+GENERATE_BUGLIST       = YES
+GENERATE_DEPRECATEDLIST= YES
+ENABLED_SECTIONS       =
+MAX_INITIALIZER_LINES  = 30
+SHOW_USED_FILES        = YES
+SHOW_FILES             = YES
+SHOW_NAMESPACES        = YES
+FILE_VERSION_FILTER    =
+LAYOUT_FILE            =
+CITE_BIB_FILES         =
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+QUIET                  = YES
+WARNINGS               = YES
+WARN_IF_UNDOCUMENTED   = YES
+WARN_IF_DOC_ERROR      = YES
+WARN_NO_PARAMDOC       = NO
+WARN_FORMAT            = "$file:$line: $text"
+WARN_LOGFILE           =
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+INPUT                  = ../libsharp
+INPUT_ENCODING         = UTF-8
+FILE_PATTERNS          = *.h \
+                         *.c \
+                         *.dox
+RECURSIVE              = YES
+EXCLUDE                =
+EXCLUDE_SYMLINKS       = NO
+EXCLUDE_PATTERNS       =
+EXCLUDE_SYMBOLS        =
+EXAMPLE_PATH           =
+EXAMPLE_PATTERNS       =
+EXAMPLE_RECURSIVE      = NO
+IMAGE_PATH             =
+INPUT_FILTER           =
+FILTER_PATTERNS        =
+FILTER_SOURCE_FILES    = NO
+FILTER_SOURCE_PATTERNS =
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+SOURCE_BROWSER         = YES
+INLINE_SOURCES         = NO
+STRIP_CODE_COMMENTS    = NO
+REFERENCED_BY_RELATION = NO
+REFERENCES_RELATION    = NO
+REFERENCES_LINK_SOURCE = YES
+USE_HTAGS              = NO
+VERBATIM_HEADERS       = YES
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+ALPHABETICAL_INDEX     = YES
+COLS_IN_ALPHA_INDEX    = 5
+IGNORE_PREFIX          =
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+GENERATE_HTML          = YES
+HTML_OUTPUT            = htmldoc
+HTML_FILE_EXTENSION    = .html
+HTML_HEADER            =
+HTML_FOOTER            = footer.html
+HTML_STYLESHEET        =
+HTML_EXTRA_FILES       =
+HTML_COLORSTYLE_HUE    = 220
+HTML_COLORSTYLE_SAT    = 100
+HTML_COLORSTYLE_GAMMA  = 80
+HTML_TIMESTAMP         = YES
+HTML_DYNAMIC_SECTIONS  = NO
+HTML_INDEX_NUM_ENTRIES = 100
+GENERATE_DOCSET        = NO
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+DOCSET_PUBLISHER_NAME  = Publisher
+GENERATE_HTMLHELP      = NO
+CHM_FILE               =
+HHC_LOCATION           =
+GENERATE_CHI           = NO
+CHM_INDEX_ENCODING     =
+BINARY_TOC             = NO
+TOC_EXPAND             = NO
+GENERATE_QHP           = NO
+QCH_FILE               =
+QHP_NAMESPACE          = org.doxygen.Project
+QHP_VIRTUAL_FOLDER     = doc
+QHP_CUST_FILTER_NAME   =
+QHP_CUST_FILTER_ATTRS  =
+QHP_SECT_FILTER_ATTRS  =
+QHG_LOCATION           =
+GENERATE_ECLIPSEHELP   = NO
+ECLIPSE_DOC_ID         = org.doxygen.Project
+DISABLE_INDEX          = NO
+GENERATE_TREEVIEW      = NO
+ENUM_VALUES_PER_LINE   = 4
+TREEVIEW_WIDTH         = 250
+EXT_LINKS_IN_WINDOW    = NO
+FORMULA_FONTSIZE       = 10
+FORMULA_TRANSPARENT    = YES
+USE_MATHJAX            = NO
+MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
+MATHJAX_EXTENSIONS     =
+SEARCHENGINE           = NO
+SERVER_BASED_SEARCH    = NO
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+GENERATE_LATEX         = NO
+LATEX_OUTPUT           = latex
+LATEX_CMD_NAME         = latex
+MAKEINDEX_CMD_NAME     = makeindex
+COMPACT_LATEX          = YES
+PAPER_TYPE             = a4wide
+EXTRA_PACKAGES         =
+LATEX_HEADER           =
+LATEX_FOOTER           =
+PDF_HYPERLINKS         = YES
+USE_PDFLATEX           = YES
+LATEX_BATCHMODE        = NO
+LATEX_HIDE_INDICES     = NO
+LATEX_SOURCE_CODE      = NO
+LATEX_BIB_STYLE        = plain
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+GENERATE_RTF           = NO
+RTF_OUTPUT             = rtf
+COMPACT_RTF            = NO
+RTF_HYPERLINKS         = NO
+RTF_STYLESHEET_FILE    =
+RTF_EXTENSIONS_FILE    =
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+GENERATE_MAN           = NO
+MAN_OUTPUT             = man
+MAN_EXTENSION          = .3
+MAN_LINKS              = NO
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+GENERATE_XML           = NO
+XML_OUTPUT             = xml
+XML_SCHEMA             =
+XML_DTD                =
+XML_PROGRAMLISTING     = YES
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+GENERATE_AUTOGEN_DEF   = NO
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+GENERATE_PERLMOD       = NO
+PERLMOD_LATEX          = NO
+PERLMOD_PRETTY         = YES
+PERLMOD_MAKEVAR_PREFIX =
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+ENABLE_PREPROCESSING   = YES
+MACRO_EXPANSION        = NO
+EXPAND_ONLY_PREDEF     = NO
+SEARCH_INCLUDES        = YES
+INCLUDE_PATH           =
+INCLUDE_FILE_PATTERNS  =
+PREDEFINED             =
+EXPAND_AS_DEFINED      =
+SKIP_FUNCTION_MACROS   = YES
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+TAGFILES               = libfftpack.tag=../libfftpack \
+                         c_utils.tag=../c_utils
+GENERATE_TAGFILE       = libsharp.tag
+ALLEXTERNALS           = NO
+EXTERNAL_GROUPS        = YES
+PERL_PATH              = /usr/bin/perl
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+CLASS_DIAGRAMS         = YES
+MSCGEN_PATH            =
+HIDE_UNDOC_RELATIONS   = YES
+HAVE_DOT               = NO
+DOT_NUM_THREADS        = 0
+DOT_FONTNAME           = FreeSans
+DOT_FONTSIZE           = 10
+DOT_FONTPATH           =
+CLASS_GRAPH            = YES
+COLLABORATION_GRAPH    = YES
+GROUP_GRAPHS           = YES
+UML_LOOK               = NO
+UML_LIMIT_NUM_FIELDS   = 10
+TEMPLATE_RELATIONS     = YES
+INCLUDE_GRAPH          = NO
+INCLUDED_BY_GRAPH      = NO
+CALL_GRAPH             = NO
+CALLER_GRAPH           = NO
+GRAPHICAL_HIERARCHY    = NO
+DIRECTORY_GRAPH        = YES
+DOT_IMAGE_FORMAT       = png
+INTERACTIVE_SVG        = NO
+DOT_PATH               =
+DOTFILE_DIRS           =
+MSCFILE_DIRS           =
+DOT_GRAPH_MAX_NODES    = 50
+MAX_DOT_GRAPH_DEPTH    = 0
+DOT_TRANSPARENT        = NO
+DOT_MULTI_TARGETS      = NO
+GENERATE_LEGEND        = YES
+DOT_CLEANUP            = YES
diff --git a/docsrc/planck.make b/docsrc/planck.make
new file mode 100644
index 0000000..0d0a462
--- /dev/null
+++ b/docsrc/planck.make
@@ -0,0 +1,20 @@
+PKG:=docsrc
+
+docsrc_idx: $(DOCDIR)_mkdir
+	cp $(SRCROOT)/docsrc/index_code.html $(DOCDIR)/index.html
+# Build HTML docs per package; order matters: later .dox files read the earlier .tag files.
+docsrc_code_doc: $(DOCDIR)_mkdir docsrc_idx
+	cd $(SRCROOT)/docsrc || exit 1; \
+	for i in c_utils libfftpack libsharp; do \
+	  doxygen $${i}.dox || exit 1; \
+	  rm -rf $(DOCDIR)/$${i}; mv htmldoc $(DOCDIR)/$${i} || exit 1; \
+	done; \
+	rm -f *.tag
+# Remove doxygen leftovers (tag files, an unmoved htmldoc tree) from the docsrc directory only.
+docsrc_clean:
+	cd $(SRCROOT)/docsrc && \
+	rm -f *.tag
+	cd $(SRCROOT)/docsrc && \
+	rm -rf htmldoc
+
+doc: docsrc_code_doc
diff --git a/libfftpack/README b/libfftpack/README
new file mode 100644
index 0000000..2c7e7cb
--- /dev/null
+++ b/libfftpack/README
@@ -0,0 +1,34 @@
+ls_fft description:
+
+This package is intended to calculate one-dimensional real or complex FFTs
+with high accuracy and good efficiency even for lengths containing large
+prime factors.
+The code is written in C, but a Fortran wrapper exists as well.
+
+Before any FFT is executed, a plan must be generated for it. Plan creation
+is designed to be fast, so that there is no significant overhead if the
+plan is only used once or a few times.
+
+The main component of the code is based on Paul N. Swarztrauber's FFTPACK in the
+double precision incarnation by Hugh C. Pumphrey
+(http://www.netlib.org/fftpack/dp.tgz).
+
+I replaced the iterative sine and cosine calculations in radfg() and radbg()
+by an exact calculation, which slightly improves the transform accuracy for
+real FFTs with lengths containing large prime factors.
+
+Since FFTPACK becomes quite slow for FFT lengths with large prime factors
+(in the worst case of prime lengths it reaches O(n*n) complexity), I
+implemented Bluestein's algorithm, which computes an FFT of length n by
+several FFTs of length n2>=2*n-1 and a convolution. Since n2 can be chosen
+to be highly composite, this algorithm is more efficient if n has large
+prime factors. The longer FFTs themselves are then computed using the FFTPACK
+routines.
+Bluestein's algorithm was implemented according to the description at
+http://en.wikipedia.org/wiki/Bluestein's_FFT_algorithm.
+
+Thread-safety:
+All routines can be called concurrently; all information needed by ls_fft
+is stored in the plan variable. However, using the same plan variable on
+multiple threads simultaneously is not supported and will lead to data
+corruption.
diff --git a/libfftpack/bluestein.c b/libfftpack/bluestein.c
new file mode 100644
index 0000000..2e2005c
--- /dev/null
+++ b/libfftpack/bluestein.c
@@ -0,0 +1,173 @@
+/*
+ *  This file is part of libfftpack.
+ *
+ *  libfftpack is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libfftpack is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libfftpack; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*
+ *  Copyright (C) 2005, 2006, 2007, 2008 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include "fftpack.h"
+#include "bluestein.h"
+
+/* Returns the sum of all prime factors of n, counted with multiplicity
+   (e.g. 12 = 2*2*3 gives 7). n==0 and n==1 have no prime factors: result 0. */
+size_t prime_factor_sum (size_t n)
+  {
+  size_t result=0,x,limit,tmp;
+  if (n==0) return 0; /* guard: 0 is divisible by 2 forever, loop would hang */
+  while (((tmp=(n>>1))<<1)==n) /* strip all factors of 2 */
+    { result+=2; n=tmp; }
+  limit=(size_t)sqrt(n+0.01);
+  for (x=3; x<=limit; x+=2) /* odd trial divisors up to sqrt(remaining n) */
+  while ((tmp=(n/x))*x==n)
+    {
+    result+=x;
+    n=tmp;
+    limit=(size_t)sqrt(n+0.01); /* cofactor shrank, so tighten the bound */
+    }
+  if (n>1) result+=n; /* what is left is a single prime above the bound */
+  return result;
+  }
+
+/* Returns the smallest number of the form 2^a*3^b*5^c that is >= n. */
+static size_t good_size(size_t n)
+  {
+  size_t f2, f23, f235, bestfac=2*n; /* initial bound; loops only look below it */
+  if (n<=6) return n; /* small n is returned unchanged (this includes n==0) */
+/* enumerate 2^a, then 2^a*3^b, then 2^a*3^b*5^c; keep the smallest hit >= n */
+  for (f2=1; f2<bestfac; f2*=2)
+    for (f23=f2; f23<bestfac; f23*=3)
+      for (f235=f23; f235<bestfac; f235*=5)
+        if (f235>=n) bestfac=f235; /* f235<bestfac here, so this only lowers it */
+  return bestfac;
+  }
+/* Plan setup for length n: allocates *tstorage, fills b_k and its FFT; *worksize = buffer size in doubles. */
+void bluestein_i (size_t n, double **tstorage, size_t *worksize)
+  {
+  static const double pi=3.14159265358979323846;
+  size_t n2=good_size(n*2-1); /* padded convolution length, >= 2*n-1 */
+  size_t m, coeff;
+  double angle, xn2;
+  double *bk, *bkf, *work;
+  double pibyn=pi/n;
+  *worksize=2+2*n+8*n2+16; /* same element count as the allocation below */
+  *tstorage = RALLOC(double,2+2*n+8*n2+16); /* RALLOC: presumably from c_utils.h */
+  ((size_t *)(*tstorage))[0]=n2; /* n2 lives at the buffer start; bluestein() reads it */
+  bk  = *tstorage+2; /* 2*n doubles: b_k as (re,im) pairs */
+  bkf = *tstorage+2+2*n; /* 2*n2 doubles: zero-padded, transformed b_k */
+  work= *tstorage+2+2*(n+n2); /* scratch passed to cffti/cfftf */
+
+/* initialize b_k */
+  bk[0] = 1;
+  bk[1] = 0;
+
+  coeff=0; /* running m*m modulo 2*n, built from the odd increments 2*m-1 */
+  for (m=1; m<n; ++m)
+    {
+    coeff+=2*m-1;
+    if (coeff>=2*n) coeff-=2*n;
+    angle = pibyn*coeff; /* pi*m*m/n reduced modulo 2*pi */
+    bk[2*m] = cos(angle);
+    bk[2*m+1] = sin(angle);
+    }
+
+/* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
+  xn2 = 1./n2;
+  bkf[0] = bk[0]*xn2;
+  bkf[1] = bk[1]*xn2;
+  for (m=2; m<2*n; m+=2) /* each scaled b_k goes to slot k and mirrored slot n2-k */
+    {
+    bkf[m]   = bkf[2*n2-m]   = bk[m]   *xn2;
+    bkf[m+1] = bkf[2*n2-m+1] = bk[m+1] *xn2;
+    }
+  for (m=2*n;m<=(2*n2-2*n+1);++m) /* zero the gap between the two copies */
+    bkf[m]=0.;
+  cffti (n2,work); /* cffti/cfftf: presumably declared in fftpack.h */
+  cfftf (n2,bkf,work);
+  }
+/* Transforms the n (re,im) pairs in data in place via tstorage (assumed to come from bluestein_i, same n). */
+void bluestein (size_t n, double *data, double *tstorage, int isign)
+  {
+  size_t n2=*((size_t *)tstorage); /* padded length stored at the buffer start */
+  size_t m;
+  double *bk, *bkf, *akf, *work;
+  bk  = tstorage+2;
+  bkf = tstorage+2+2*n;
+  work= tstorage+2+2*(n+n2);
+  akf = tstorage+2+2*n+6*n2+16; /* last 2*n2 doubles of the buffer */
+
+/* initialize a_k and FFT it */
+  if (isign>0) /* a_k = data_k * b_k */
+    for (m=0; m<2*n; m+=2)
+      {
+      akf[m]   = data[m]*bk[m]   - data[m+1]*bk[m+1];
+      akf[m+1] = data[m]*bk[m+1] + data[m+1]*bk[m];
+      }
+  else /* a_k = data_k * conj(b_k) */
+    for (m=0; m<2*n; m+=2)
+      {
+      akf[m]   = data[m]*bk[m]   + data[m+1]*bk[m+1];
+      akf[m+1] =-data[m]*bk[m+1] + data[m+1]*bk[m];
+      }
+  for (m=2*n; m<2*n2; ++m) /* zero-pad up to n2 complex values */
+    akf[m]=0;
+
+  cfftf (n2,akf,work);
+
+/* do the convolution */
+  if (isign>0) /* akf *= conj(bkf) */
+    for (m=0; m<2*n2; m+=2)
+      {
+      double im = -akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
+      akf[m  ]  =  akf[m]*bkf[m]   + akf[m+1]*bkf[m+1];
+      akf[m+1]  = im;
+      }
+  else /* akf *= bkf */
+    for (m=0; m<2*n2; m+=2)
+      {
+      double im = akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
+      akf[m  ]  = akf[m]*bkf[m]   - akf[m+1]*bkf[m+1];
+      akf[m+1]  = im;
+      }
+
+
+/* inverse FFT */
+  cfftb (n2,akf,work);
+
+/* multiply by b_k* */
+  if (isign>0) /* data_k = b_k * akf_k */
+    for (m=0; m<2*n; m+=2)
+      {
+      data[m]   = bk[m]  *akf[m] - bk[m+1]*akf[m+1];
+      data[m+1] = bk[m+1]*akf[m] + bk[m]  *akf[m+1];
+      }
+  else /* data_k = conj(b_k) * akf_k */
+    for (m=0; m<2*n; m+=2)
+      {
+      data[m]   = bk[m]  *akf[m] + bk[m+1]*akf[m+1];
+      data[m+1] =-bk[m+1]*akf[m] + bk[m]  *akf[m+1];
+      }
+  }
diff --git a/libfftpack/bluestein.h b/libfftpack/bluestein.h
new file mode 100644
index 0000000..91e5b28
--- /dev/null
+++ b/libfftpack/bluestein.h
@@ -0,0 +1,48 @@
+/*
+ *  This file is part of libfftpack.
+ *
+ *  libfftpack is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libfftpack is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libfftpack; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*
+ *  Copyright (C) 2005 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#ifndef PLANCK_BLUESTEIN_H
+#define PLANCK_BLUESTEIN_H
+
+#include "c_utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Sum of the prime factors of n, counted with multiplicity. */
+size_t prime_factor_sum (size_t n);
+/* bluestein_i allocates and fills *tstorage for length n; bluestein transforms data in place with it. */
+void bluestein_i (size_t n, double **tstorage, size_t *worksize);
+void bluestein (size_t n, double *data, double *tstorage, int isign);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/libfftpack/fftpack.c b/libfftpack/fftpack.c
new file mode 100644
index 0000000..6d09d06
--- /dev/null
+++ b/libfftpack/fftpack.c
@@ -0,0 +1,833 @@
+/*
+ *  This file is part of libfftpack.
+ *
+ *  libfftpack is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libfftpack is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libfftpack; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*
+  fftpack.c : A set of FFT routines in C.
+  Algorithmically based on Fortran-77 FFTPACK by Paul N. Swarztrauber
+  (Version 4, 1985).
+
+  C port by Martin Reinecke (2010)
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "fftpack.h"
+/* Helper macros; WA/CH/CC rely on locals named wa, ch, cc, ido, l1 and cdim at the point of use. */
+#define WA(x,i) wa[(i)+(x)*ido] /* entry i of the x-th run of ido values in wa */
+#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))] /* a runs fastest, then b (extent l1), then c */
+#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))] /* a runs fastest, then b (extent cdim), then c */
+#define PM(a,b,c,d) { a=c+d; b=c-d; } /* sum and difference */
+#define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; } /* PM on .r/.i pairs */
+#define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; } /* a = b+c on .r/.i pairs */
+#define SCALEC(a,b) { a.r*=b; a.i*=b; } /* scale both parts by b */
+#define CONJFLIPC(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; } /* a becomes (-a.i, a.r) */
+/* (a+ib) = conj(c+id) * (e+if) */
+#define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; }
+/* Pair of doubles named r and i, matching the .r/.i fields used by PMC, ADDC, SCALEC and CONJFLIPC. */
+typedef struct {
+  double r,i;
+} cmplx;
+/* fftpack_inc.c is included twice: X(arg) pastes passb (with BACKWARD defined) or passf in front of arg. */
+#define CONCAT(a,b) a ## b
+
+#define X(arg) CONCAT(passb,arg)
+#define BACKWARD
+#include "fftpack_inc.c"
+#undef BACKWARD
+#undef X
+/* second inclusion, BACKWARD undefined (presumably yields the forward variants - confirm in fftpack_inc.c) */
+#define X(arg) CONCAT(passf,arg)
+#include "fftpack_inc.c"
+#undef X
+/* Layout for the radf2..radf5 routines that follow: CC is indexed with extent l1 in the middle, CH with cdim. */
+#undef CC
+#undef CH
+#define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))]
+#define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))]
+/* Radix-2 pass, presumably forward real FFT (cf. FFTPACK RADF2): reads cc(i,k,j), writes ch(i,j,k). */
+static void radf2 (size_t ido, size_t l1, const double *cc, double *ch,
+  const double *wa)
+  {
+  const size_t cdim=2; /* used by the CH macro */
+  size_t i, k, ic;
+  double ti2, tr2;
+
+  for (k=0; k<l1; k++) /* i==0 terms: plain sum and difference */
+    PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1))
+  if ((ido&1)==0) /* even ido: the last element i==ido-1 is handled separately */
+    for (k=0; k<l1; k++)
+      {
+      CH(    0,1,k) = -CC(ido-1,k,1);
+      CH(ido-1,0,k) =  CC(ido-1,k,0);
+      }
+  if (ido<=2) return; /* the loop below would not execute */
+  for (k=0; k<l1; k++)
+    for (i=2; i<ido; i+=2) /* elements i-1 and i are treated as one (re,im) pair */
+      {
+      ic=ido-i; /* mirrored index */
+      MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
+      PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2)
+      PM (CH(i  ,0,k),CH(ic  ,1,k),ti2,CC(i  ,k,0))
+      }
+  }
+/* Radix-3 pass, presumably forward real FFT (cf. FFTPACK RADF3): reads cc(i,k,j), writes ch(i,j,k). */
+static void radf3(size_t ido, size_t l1, const double *cc, double *ch,
+  const double *wa)
+  {
+  const size_t cdim=3; /* used by the CH macro */
+  static const double taur=-0.5, taui=0.86602540378443864676; /* cos, sin of 2*pi/3 */
+  size_t i, k, ic;
+  double ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3;
+
+  for (k=0; k<l1; k++) /* i==0 terms, no twiddles involved */
+    {
+    cr2=CC(0,k,1)+CC(0,k,2);
+    CH(0,0,k) = CC(0,k,0)+cr2;
+    CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
+    CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
+    }
+  if (ido==1) return;
+  for (k=0; k<l1; k++)
+    for (i=2; i<ido; i+=2) /* elements i-1 and i are treated as one (re,im) pair */
+      {
+      ic=ido-i; /* mirrored index */
+      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
+      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
+      cr2=dr2+dr3;
+      ci2=di2+di3;
+      CH(i-1,0,k) = CC(i-1,k,0)+cr2;
+      CH(i  ,0,k) = CC(i  ,k,0)+ci2;
+      tr2 = CC(i-1,k,0)+taur*cr2;
+      ti2 = CC(i  ,k,0)+taur*ci2;
+      tr3 = taui*(di2-di3);
+      ti3 = taui*(dr3-dr2);
+      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3)
+      PM(CH(i  ,2,k),CH(ic  ,1,k),ti3,ti2)
+      }
+  }
+/* Radix-4 pass, presumably forward real FFT (cf. FFTPACK RADF4): reads cc(i,k,j), writes ch(i,j,k). */
+static void radf4(size_t ido, size_t l1, const double *cc, double *ch,
+  const double *wa)
+  {
+  const size_t cdim=4; /* used by the CH macro */
+  static const double hsqt2=0.70710678118654752440; /* sqrt(2)/2 */
+  size_t i, k, ic;
+  double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+
+  for (k=0; k<l1; k++) /* i==0 terms, no twiddles involved */
+    {
+    PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1))
+    PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2))
+    PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1)
+    }
+  if ((ido&1)==0) /* even ido: the last element i==ido-1 is handled separately */
+    for (k=0; k<l1; k++)
+      {
+      ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
+      tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
+      PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1)
+      PM (CH(    0,3,k),CH(    0,1,k),ti1,CC(ido-1,k,2))
+      }
+  if (ido<=2) return; /* the loop below would not execute */
+  for (k=0; k<l1; k++)
+    for (i=2; i<ido; i+=2) /* elements i-1 and i are treated as one (re,im) pair */
+      {
+      ic=ido-i; /* mirrored index */
+      MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
+      MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
+      MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
+      PM(tr1,tr4,cr4,cr2)
+      PM(ti1,ti4,ci2,ci4)
+      PM(tr2,tr3,CC(i-1,k,0),cr3)
+      PM(ti2,ti3,CC(i  ,k,0),ci3)
+      PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1)
+      PM(CH(i  ,0,k),CH(ic  ,3,k),ti1,ti2)
+      PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4)
+      PM(CH(i  ,2,k),CH(ic  ,1,k),tr4,ti3)
+      }
+  }
+/* Radix-5 pass, presumably forward real FFT (cf. FFTPACK RADF5): reads cc(i,k,j), writes ch(i,j,k). */
+static void radf5(size_t ido, size_t l1, const double *cc, double *ch,
+  const double *wa)
+  {
+  const size_t cdim=5; /* used by the CH macro */
+  static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212, /* cos, sin of 2*pi/5 */
+                      tr12=-0.8090169943749474241, ti12=0.58778525229247312917; /* cos, sin of 4*pi/5 */
+  size_t i, k, ic;
+  double ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3,
+         dr4, dr5, cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
+
+  for (k=0; k<l1; k++) /* i==0 terms, no twiddles involved */
+    {
+    PM (cr2,ci5,CC(0,k,4),CC(0,k,1))
+    PM (cr3,ci4,CC(0,k,3),CC(0,k,2))
+    CH(0,0,k)=CC(0,k,0)+cr2+cr3;
+    CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
+    CH(0,2,k)=ti11*ci5+ti12*ci4;
+    CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
+    CH(0,4,k)=ti12*ci5-ti11*ci4;
+    }
+  if (ido==1) return;
+  for (k=0; k<l1;++k)
+    for (i=2; i<ido; i+=2) /* elements i-1 and i are treated as one (re,im) pair */
+      {
+      ic=ido-i; /* mirrored index */
+      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
+      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
+      MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
+      MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4))
+      PM(cr2,ci5,dr5,dr2)
+      PM(ci2,cr5,di2,di5)
+      PM(cr3,ci4,dr4,dr3)
+      PM(ci3,cr4,di3,di4)
+      CH(i-1,0,k)=CC(i-1,k,0)+cr2+cr3;
+      CH(i  ,0,k)=CC(i  ,k,0)+ci2+ci3;
+      tr2=CC(i-1,k,0)+tr11*cr2+tr12*cr3;
+      ti2=CC(i  ,k,0)+tr11*ci2+tr12*ci3;
+      tr3=CC(i-1,k,0)+tr12*cr2+tr11*cr3;
+      ti3=CC(i  ,k,0)+tr12*ci2+tr11*ci3;
+      MULPM(tr5,tr4,cr5,cr4,ti11,ti12)
+      MULPM(ti5,ti4,ci5,ci4,ti11,ti12)
+      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5)
+      PM(CH(i  ,2,k),CH(ic  ,1,k),ti5,ti2)
+      PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4)
+      PM(CH(i  ,4,k),CH(ic  ,3,k),ti4,ti3)
+      }
+  }
+/* Generic radix-ip pass (presumably forward real FFT, cf. FFTPACK RADFG); cc is read, used as scratch and holds the result; ch is overwritten too. */
+#undef CH
+#undef CC
+#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
+#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
+#define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
+#define C2(a,b) cc[(a)+idl1*(b)]
+#define CH2(a,b) ch[(a)+idl1*(b)]
+static void radfg(size_t ido, size_t ip, size_t l1, size_t idl1,
+  double *cc, double *ch, const double *wa)
+  {
+  const size_t cdim=ip; /* used by the CC macro */
+  static const double twopi=6.28318530717958647692;
+  size_t idij, ipph, i, j, k, l, j2, ic, jc, lc, ik;
+  double ai1, ai2, ar1, ar2, arg;
+  double *csarr;
+  size_t aidx;
+
+  ipph=(ip+1)/ 2; /* j and jc=ip-j are paired for 1<=j<ipph */
+  if(ido!=1)
+    {
+    memcpy(ch,cc,idl1*sizeof(double)); /* j==0 slice is copied as is */
+
+    for(j=1; j<ip; j++) /* ch = cc with the wa factors applied (conjugated, see MULPM) */
+      for(k=0; k<l1; k++)
+        {
+        CH(0,k,j)=C1(0,k,j);
+        idij=(j-1)*ido+1;
+        for(i=2; i<ido; i+=2,idij+=2)
+          MULPM(CH(i-1,k,j),CH(i,k,j),wa[idij-1],wa[idij],C1(i-1,k,j),C1(i,k,j))
+        }
+
+    for(j=1,jc=ip-1; j<ipph; j++,jc--) /* sums/differences of paired slices back into cc */
+      for(k=0; k<l1; k++)
+        for(i=2; i<ido; i+=2)
+          {
+          PM(C1(i-1,k,j),C1(i  ,k,jc),CH(i-1,k,jc),CH(i-1,k,j ))
+          PM(C1(i  ,k,j),C1(i-1,k,jc),CH(i  ,k,j ),CH(i  ,k,jc))
+          }
+    }
+  else
+    memcpy(cc,ch,idl1*sizeof(double)); /* ido==1: j==0 slice goes from ch to cc */
+
+  for(j=1,jc=ip-1; j<ipph; j++,jc--) /* i==0 terms of the paired slices */
+    for(k=0; k<l1; k++)
+      PM(C1(0,k,j),C1(0,k,jc),CH(0,k,jc),CH(0,k,j))
+
+  csarr=RALLOC(double,2*ip); /* cos/sin table; RALLOC/DEALLOC presumably from c_utils.h */
+  arg=twopi / ip;
+  csarr[0]=1.;
+  csarr[1]=0.;
+  csarr[2]=csarr[2*ip-2]=cos(arg);
+  csarr[3]=sin(arg); csarr[2*ip-1]=-csarr[3];
+  for (i=2; i<=ip/2; ++i) /* csarr[2*i],csarr[2*i+1] = cos,sin(i*arg); entry ip-i mirrors it with negated sine */
+    {
+    csarr[2*i]=csarr[2*ip-2*i]=cos(i*arg);
+    csarr[2*i+1]=sin(i*arg);
+    csarr[2*ip-2*i+1]=-csarr[2*i+1];
+    }
+  for(l=1,lc=ip-1; l<ipph; l++,lc--) /* combine the slices of cc into ch using table entries */
+    {
+    ar1=csarr[2*l];
+    ai1=csarr[2*l+1];
+    for(ik=0; ik<idl1; ik++)
+      {
+      CH2(ik,l)=C2(ik,0)+ar1*C2(ik,1);
+      CH2(ik,lc)=ai1*C2(ik,ip-1);
+      }
+    aidx=2*l;
+    for(j=2,jc=ip-2; j<ipph; j++,jc--)
+      {
+      aidx+=2*l; /* table offset of entry (j*l) modulo ip */
+      if (aidx>=2*ip) aidx-=2*ip;
+      ar2=csarr[aidx];
+      ai2=csarr[aidx+1];
+      for(ik=0; ik<idl1; ik++)
+        {
+        CH2(ik,l )+=ar2*C2(ik,j );
+        CH2(ik,lc)+=ai2*C2(ik,jc);
+        }
+      }
+    }
+  DEALLOC(csarr);
+
+  for(j=1; j<ipph; j++) /* slice 0 of ch accumulates cc slices 1..ipph-1 */
+    for(ik=0; ik<idl1; ik++)
+      CH2(ik,0)+=C2(ik,j);
+
+  for(k=0; k<l1; k++) /* copy results from ch into cc using the CC index order */
+    memcpy(&CC(0,0,k),&CH(0,k,0),ido*sizeof(double));
+  for(j=1; j<ipph; j++)
+    {
+    jc=ip-j;
+    j2=2*j;
+    for(k=0; k<l1; k++)
+      {
+      CC(ido-1,j2-1,k) = CH(0,k,j );
+      CC(0    ,j2  ,k) = CH(0,k,jc);
+      }
+    }
+  if(ido==1) return;
+
+  for(j=1; j<ipph; j++)
+    {
+    jc=ip-j;
+    j2=2*j;
+    for(k=0; k<l1; k++)
+      for(i=2; i<ido; i+=2)
+        {
+        ic=ido-i; /* mirrored index */
+        PM (CC(i-1,j2,k),CC(ic-1,j2-1,k),CH(i-1,k,j ),CH(i-1,k,jc))
+        PM (CC(i  ,j2,k),CC(ic  ,j2-1,k),CH(i  ,k,jc),CH(i  ,k,j ))
+        }
+    }
+  }
+/* Layout for the radb* routines that follow: CH is indexed with extent l1 in the middle, CC with cdim. */
+#undef CC
+#undef CH
+#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
+#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
+/* Radix-2 pass, presumably backward real FFT (cf. FFTPACK RADB2): reads cc(i,j,k), writes ch(i,k,j). */
+static void radb2(size_t ido, size_t l1, const double *cc, double *ch,
+  const double *wa)
+  {
+  const size_t cdim=2; /* used by the CC macro */
+  size_t i, k, ic;
+  double ti2, tr2;
+
+  for (k=0; k<l1; k++) /* i==0 terms: plain sum and difference */
+    PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k))
+  if ((ido&1)==0) /* even ido: the last element i==ido-1 is handled separately */
+    for (k=0; k<l1; k++)
+      {
+      CH(ido-1,k,0) =  2*CC(ido-1,0,k);
+      CH(ido-1,k,1) = -2*CC(0    ,1,k);
+      }
+  if (ido<=2) return; /* the loop below would not execute */
+  for (k=0; k<l1;++k)
+    for (i=2; i<ido; i+=2) /* elements i-1 and i are treated as one (re,im) pair */
+      {
+      ic=ido-i; /* mirrored index */
+      PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k))
+      PM (ti2,CH(i  ,k,0),CC(i  ,0,k),CC(ic  ,1,k))
+      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2)
+      }
+  }
+/* Radix-3 pass, presumably backward real FFT (cf. FFTPACK RADB3): reads cc(i,j,k), writes ch(i,k,j). */
+static void radb3(size_t ido, size_t l1, const double *cc, double *ch,
+  const double *wa)
+  {
+  const size_t cdim=3; /* used by the CC macro */
+  static const double taur=-0.5, taui=0.86602540378443864676; /* cos, sin of 2*pi/3 */
+  size_t i, k, ic;
+  double ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
+
+  for (k=0; k<l1; k++) /* i==0 terms, no twiddles involved */
+    {
+    tr2=2*CC(ido-1,1,k);
+    cr2=CC(0,0,k)+taur*tr2;
+    CH(0,k,0)=CC(0,0,k)+tr2;
+    ci3=2*taui*CC(0,2,k);
+    PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
+    }
+  if (ido==1) return;
+  for (k=0; k<l1; k++)
+    for (i=2; i<ido; i+=2) /* elements i-1 and i are treated as one (re,im) pair */
+      {
+      ic=ido-i; /* mirrored index */
+      tr2=CC(i-1,2,k)+CC(ic-1,1,k);
+      ti2=CC(i  ,2,k)-CC(ic  ,1,k);
+      cr2=CC(i-1,0,k)+taur*tr2;
+      ci2=CC(i  ,0,k)+taur*ti2;
+      CH(i-1,k,0)=CC(i-1,0,k)+tr2;
+      CH(i  ,k,0)=CC(i  ,0,k)+ti2;
+      cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));
+      ci3=taui*(CC(i  ,2,k)+CC(ic  ,1,k));
+      PM(dr3,dr2,cr2,ci3)
+      PM(di2,di3,ci2,cr3)
+      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2)
+      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
+      }
+  }
+/* Radix-4 pass, presumably backward real FFT (cf. FFTPACK RADB4): reads cc(i,j,k), writes ch(i,k,j). */
+static void radb4(size_t ido, size_t l1, const double *cc, double *ch,
+  const double *wa)
+  {
+  const size_t cdim=4; /* used by the CC macro */
+  static const double sqrt2=1.41421356237309504880; /* sqrt(2) */
+  size_t i, k, ic;
+  double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+
+  for (k=0; k<l1; k++) /* i==0 terms, no twiddles involved */
+    {
+    PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k))
+    tr3=2*CC(ido-1,1,k);
+    tr4=2*CC(0,2,k);
+    PM (CH(0,k,0),CH(0,k,2),tr2,tr3)
+    PM (CH(0,k,3),CH(0,k,1),tr1,tr4)
+    }
+  if ((ido&1)==0) /* even ido: the last element i==ido-1 is handled separately */
+    for (k=0; k<l1; k++)
+      {
+      PM (ti1,ti2,CC(0    ,3,k),CC(0    ,1,k))
+      PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k))
+      CH(ido-1,k,0)=tr2+tr2;
+      CH(ido-1,k,1)=sqrt2*(tr1-ti1);
+      CH(ido-1,k,2)=ti2+ti2;
+      CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
+      }
+  if (ido<=2) return; /* the loop below would not execute */
+  for (k=0; k<l1;++k)
+    for (i=2; i<ido; i+=2) /* elements i-1 and i are treated as one (re,im) pair */
+      {
+      ic=ido-i; /* mirrored index */
+      PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k))
+      PM (ti1,ti2,CC(i  ,0,k),CC(ic  ,3,k))
+      PM (tr4,ti3,CC(i  ,2,k),CC(ic  ,1,k))
+      PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k))
+      PM (CH(i-1,k,0),cr3,tr2,tr3)
+      PM (CH(i  ,k,0),ci3,ti2,ti3)
+      PM (cr4,cr2,tr1,tr4)
+      PM (ci2,ci4,ti1,ti4)
+      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2)
+      MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3)
+      MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4)
+      }
+  }
+
+static void radb5(size_t ido, size_t l1, const double *cc, double *ch,
+  const double *wa) /* radix-5 pass of the backward real FFT (rfftb1 calls it for ip==5): reads cc, writes ch */
+  {
+  const size_t cdim=5; /* presumably consumed by the CC() index macro, which is defined outside this view */
+  static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212, /* cos, sin of 2*pi/5 */
+                      tr12=-0.8090169943749474241, ti12=0.58778525229247312917; /* cos, sin of 4*pi/5 */
+  size_t i, k, ic;
+  double ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4,
+         ti2, ti3, ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
+
+  for (k=0; k<l1; k++) /* i==0 column: computed without any wa factor */
+    {
+    ti5=2*CC(0,2,k);
+    ti4=2*CC(0,4,k);
+    tr2=2*CC(ido-1,1,k);
+    tr3=2*CC(ido-1,3,k);
+    CH(0,k,0)=CC(0,0,k)+tr2+tr3;
+    cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
+    cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
+    MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
+    PM(CH(0,k,4),CH(0,k,1),cr2,ci5)
+    PM(CH(0,k,3),CH(0,k,2),cr3,ci4)
+    }
+  if (ido==1) return; /* only the i==0 column exists */
+  for (k=0; k<l1;++k)
+    for (i=2; i<ido; i+=2) /* columns (i-1,i), each combined with the mirrored columns at ic=ido-i */
+      {
+      ic=ido-i;
+      PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k))
+      PM(ti5,ti2,CC(i  ,2,k),CC(ic  ,1,k))
+      PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k))
+      PM(ti4,ti3,CC(i  ,4,k),CC(ic  ,3,k))
+      CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
+      CH(i  ,k,0)=CC(i  ,0,k)+ti2+ti3;
+      cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
+      ci2=CC(i  ,0,k)+tr11*ti2+tr12*ti3;
+      cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
+      ci3=CC(i  ,0,k)+tr12*ti2+tr11*ti3;
+      MULPM(cr5,cr4,tr5,tr4,ti11,ti12)
+      MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
+      PM(dr4,dr3,cr3,ci4)
+      PM(di3,di4,ci3,cr4)
+      PM(dr5,dr2,cr2,ci5)
+      PM(di2,di5,ci2,cr5)
+      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2) /* outputs 1..4 take the WA() factors; output 0 does not */
+      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
+      MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4)
+      MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5)
+      }
+  }
+
+static void radbg(size_t ido, size_t ip, size_t l1, size_t idl1,
+  double *cc, double *ch, const double *wa) /* generic radix-ip pass of the backward real FFT; both cc and ch are written */
+  {
+  const size_t cdim=ip; /* presumably consumed by the CC() index macro, which is defined outside this view */
+  static const double twopi=6.28318530717958647692;
+  size_t idij, ipph, i, j, k, l, j2, ic, jc, lc, ik;
+  double ai1, ai2, ar1, ar2, arg;
+  double *csarr; /* table of (cos,sin) pairs for the angles i*2*pi/ip, i=0..ip-1 */
+  size_t aidx;
+
+  ipph=(ip+1)/ 2;
+  for(k=0; k<l1; k++) /* component 0 is copied over unchanged, ido values per k */
+    memcpy(&CH(0,k,0),&CC(0,0,k),ido*sizeof(double));
+  for(j=1; j<ipph; j++) /* i==0 column of components j and ip-j */
+    {
+    jc=ip-j;
+    j2=2*j;
+    for(k=0; k<l1; k++)
+      {
+      CH(0,k,j )=2*CC(ido-1,j2-1,k);
+      CH(0,k,jc)=2*CC(0    ,j2  ,k);
+      }
+    }
+
+  if(ido!=1) /* remaining columns, taken in pairs (i-1,i) with mirrored index ic=ido-i */
+    for(j=1,jc=ip-1; j<ipph; j++,jc--)
+      for(k=0; k<l1; k++)
+        for(i=2; i<ido; i+=2)
+          {
+          ic=ido-i;
+          PM (CH(i-1,k,j ),CH(i-1,k,jc),CC(i-1,2*j,k),CC(ic-1,2*j-1,k))
+          PM (CH(i  ,k,jc),CH(i  ,k,j ),CC(i  ,2*j,k),CC(ic  ,2*j-1,k))
+          }
+
+  csarr=RALLOC(double,2*ip); /* released by the DEALLOC below */
+  arg=twopi/ip;
+  csarr[0]=1.;
+  csarr[1]=0.;
+  csarr[2]=csarr[2*ip-2]=cos(arg);
+  csarr[3]=sin(arg); csarr[2*ip-1]=-csarr[3];
+  for (i=2; i<=ip/2; ++i) /* entry ip-i mirrors entry i: same cosine, negated sine */
+    {
+    csarr[2*i]=csarr[2*ip-2*i]=cos(i*arg);
+    csarr[2*i+1]=sin(i*arg);
+    csarr[2*ip-2*i+1]=-csarr[2*i+1];
+    }
+  for(l=1; l<ipph; l++) /* accumulate C2(.,l) with cosine weights and C2(.,lc) with sine weights */
+    {
+    lc=ip-l;
+    ar1=csarr[2*l];
+    ai1=csarr[2*l+1];
+    for(ik=0; ik<idl1; ik++) /* j==1 term initialises the sums */
+      {
+      C2(ik,l)=CH2(ik,0)+ar1*CH2(ik,1);
+      C2(ik,lc)=ai1*CH2(ik,ip-1);
+      }
+    aidx=2*l;
+    for(j=2; j<ipph; j++)
+      {
+      jc=ip-j;
+      aidx+=2*l; /* table offset of angle j*l, kept below 2*ip by the wrap on the next line */
+      if (aidx>=2*ip) aidx-=2*ip;
+      ar2=csarr[aidx];
+      ai2=csarr[aidx+1];
+      for(ik=0; ik<idl1; ik++)
+        {
+        C2(ik,l )+=ar2*CH2(ik,j );
+        C2(ik,lc)+=ai2*CH2(ik,jc);
+        }
+      }
+    }
+  DEALLOC(csarr);
+
+  for(j=1; j<ipph; j++) /* component 0 additionally receives the sum of components 1..ipph-1 */
+    for(ik=0; ik<idl1; ik++)
+      CH2(ik,0)+=CH2(ik,j);
+
+  for(j=1,jc=ip-1; j<ipph; j++,jc--) /* i==0 column of the final j/jc combination */
+    for(k=0; k<l1; k++)
+      PM (CH(0,k,jc),CH(0,k,j),C1(0,k,j),C1(0,k,jc))
+
+  if(ido==1) /* no further columns and no wa factors to apply */
+    return;
+  for(j=1,jc=ip-1; j<ipph; j++,jc--)
+    for(k=0; k<l1; k++)
+      for(i=2; i<ido; i+=2)
+        {
+        PM (CH(i-1,k,jc),CH(i-1,k,j ),C1(i-1,k,j),C1(i  ,k,jc))
+        PM (CH(i  ,k,j ),CH(i  ,k,jc),C1(i  ,k,j),C1(i-1,k,jc))
+        }
+  memcpy(cc,ch,idl1*sizeof(double)); /* first idl1 values of ch are copied to cc as they are */
+
+  for(j=1; j<ip; j++) /* components 1..ip-1 go back to cc; columns i>=1 are combined with wa */
+    for(k=0; k<l1; k++)
+      {
+      C1(0,k,j)=CH(0,k,j);
+      idij=(j-1)*ido+1;
+      for(i=2; i<ido; i+=2,idij+=2)
+        MULPM (C1(i,k,j),C1(i-1,k,j),wa[idij-1],wa[idij],CH(i,k,j),CH(i-1,k,j))
+      }
+  }
+
+#undef CC
+#undef CH
+#undef PM
+#undef MULPM
+
+
+/*----------------------------------------------------------------------
+   cfft1, cfftf, cfftb, factorize, cffti1, cffti. Complex FFTs.
+  ----------------------------------------------------------------------*/
+
+static void cfft1(size_t n, cmplx c[], cmplx ch[], const cmplx wa[],
+  const size_t ifac[], int isign)
+  { /* Complex FFT driver: runs one pass per factor listed in ifac[2..]; the result ends up in c. */
+  const size_t nfact=ifac[1];   /* number of factors */
+  const int backward=(isign>0); /* selects the passb* kernels, otherwise passf* */
+  size_t pass, l1=1, woff=0;
+  cmplx *src=c, *dst=ch;        /* exchanged after every pass */
+
+  for(pass=0; pass<nfact; ++pass)
+    {
+    const size_t ip=ifac[pass+2];
+    const size_t l2=ip*l1, ido=n/l2;
+    const cmplx *tw=wa+woff;    /* twiddle factors belonging to this pass */
+    /* radices 2..6 have dedicated kernels, every other radix takes the generic one */
+    switch (ip)
+      {
+      case 4:  if (backward) passb4(ido, l1, src, dst, tw);
+               else          passf4(ido, l1, src, dst, tw); break;
+      case 2:  if (backward) passb2(ido, l1, src, dst, tw);
+               else          passf2(ido, l1, src, dst, tw); break;
+      case 3:  if (backward) passb3(ido, l1, src, dst, tw);
+               else          passf3(ido, l1, src, dst, tw); break;
+      case 5:  if (backward) passb5(ido, l1, src, dst, tw);
+               else          passf5(ido, l1, src, dst, tw); break;
+      case 6:  if (backward) passb6(ido, l1, src, dst, tw);
+               else          passf6(ido, l1, src, dst, tw); break;
+      default: if (backward) passbg(ido, ip, l1, src, dst, tw);
+               else          passfg(ido, ip, l1, src, dst, tw); break;
+      }
+    { cmplx *held=src; src=dst; dst=held; } /* this pass's output feeds the next one */
+    l1=l2;
+    woff+=(ip-1)*ido;
+    }
+  if (src!=c) /* final data sits in ch: copy it back */
+    memcpy (c,src,n*sizeof(cmplx));
+  }
+
+void cfftf(size_t n, double c[], double wsave[])
+  { /* forward complex FFT of the n complex values in c; wsave holds scratch at +0, twiddles at +2n, factors at +4n */
+  if (n>1) /* n==1 needs no work; n==0 must not reach cfft1, which would read a factor list from wsave */
+    cfft1(n, (cmplx*)c, (cmplx*)wsave, (cmplx*)(wsave+2*n),
+          (size_t*)(wsave+4*n),-1);
+  }
+
+void cfftb(size_t n, double c[], double wsave[])
+  { /* backward complex FFT of the n complex values in c; wsave holds scratch at +0, twiddles at +2n, factors at +4n */
+  if (n>1) /* n==1 needs no work; n==0 must not reach cfft1, which would read a factor list from wsave */
+    cfft1(n, (cmplx*)c, (cmplx*)wsave, (cmplx*)(wsave+2*n),
+          (size_t*)(wsave+4*n),+1);
+  }
+
+static void factorize (size_t n, const size_t *pf, size_t npf, size_t *ifac)
+  { /* Factor n: ifac[0]=n, ifac[1]=number of factors, ifac[2..]=the factors found. */
+  size_t nl=n, nf=0, ntry=0, j=0, i; /* nl: the part of n that is not factored yet */
+
+  ifac[0]=n;
+  ifac[1]=0;
+  if (n<2) return; /* 0 and 1 have no factors; the search below would never terminate for them */
+  while (nl!=1)
+    {
+    /* next trial divisor: first the npf preferred values in pf, afterwards previous divisor + 2 */
+    j++;
+    ntry = (j<=npf) ? pf[j-1] : ntry+2;
+    while (nl%ntry==0) /* take out every power of ntry before moving on */
+      {
+      nf++;
+      ifac[nf+1]=ntry;
+      nl/=ntry;
+      if ((ntry==2) && (nf!=1)) /* a factor 2 found late is moved to the front of the list */
+        {
+        for (i=nf+1; i>2; --i)
+          ifac[i]=ifac[i-1];
+        ifac[2]=2;
+        }
+      }
+    }
+  ifac[1]=nf;
+  }
+
+static void cffti1(size_t n, double wa[], size_t ifac[]) /* fills ifac with the factors of n and wa with (cos,sin) twiddle pairs */
+  {
+  static const size_t ntryh[5]={4,6,3,2,5}; /* divisors tried first, in this order */
+  static const double twopi=6.28318530717958647692;
+  size_t j, k, fi;
+
+  double argh=twopi/n;
+  size_t i=0, l1=1; /* i: running write position in wa */
+  factorize (n,ntryh,5,ifac);
+  for(k=1; k<=ifac[1]; k++) /* one group of twiddles per factor */
+    {
+    size_t ip=ifac[k+1];
+    size_t ido=n/(l1*ip);
+    for(j=1; j<ip; j++)
+      {
+      size_t is = i; /* start of this j-row */
+      double argld=j*l1*argh;
+      wa[i  ]=1; /* the row starts with the pair (1,0) */
+      wa[i+1]=0;
+      for(fi=1; fi<=ido; fi++) /* i is not advanced after the last pair, so the next row's (1,0) overwrites it */
+        {
+        double arg=fi*argld;
+        i+=2;
+        wa[i  ]=cos(arg);
+        wa[i+1]=sin(arg);
+        }
+      if(ip>6) /* radices above 6: the row's first pair is replaced by the fi==ido pair */
+        {
+        wa[is  ]=wa[i  ];
+        wa[is+1]=wa[i+1];
+        }
+      }
+    l1*=ip;
+    }
+  }
+
+void cffti(size_t n, double wsave[])
+  { if (n>1) cffti1(n, wsave+2*n,(size_t*)(wsave+4*n)); } /* n<2: nothing to prepare; n==0 would never leave factorize() */
+
+
+/*----------------------------------------------------------------------
+   rfftf1, rfftb1, rfftf, rfftb, rffti1, rffti. Real FFTs.
+  ----------------------------------------------------------------------*/
+
+static void rfftf1(size_t n, double c[], double ch[], const double wa[],
+  const size_t ifac[]) /* forward real FFT driver: one radf* pass per factor, factors taken last to first */
+  {
+  size_t k1, l1=n, nf=ifac[1], iw=n-1; /* iw walks backwards through the twiddle array */
+  double *p1=ch, *p2=c;
+
+  for(k1=1; k1<=nf;++k1)
+    {
+    size_t ip=ifac[nf-k1+2];
+    size_t ido=n / l1;
+    l1 /= ip;
+    iw-=(ip-1)*ido;
+    SWAP (p1,p2,double *); /* swapped before the call: p1 is this pass's input, p2 its output */
+    if(ip==4)
+      radf4(ido, l1, p1, p2, wa+iw);
+    else if(ip==2)
+      radf2(ido, l1, p1, p2, wa+iw);
+    else if(ip==3)
+      radf3(ido, l1, p1, p2, wa+iw);
+    else if(ip==5)
+      radf5(ido, l1, p1, p2, wa+iw);
+    else
+      {
+      if (ido==1) /* presumably radfg leaves its result in the other buffer for ido==1 (radfg is outside this view) */
+        SWAP (p1,p2,double *);
+      radfg(ido, ip, l1, ido*l1, p1, p2, wa+iw);
+      SWAP (p1,p2,double *);
+      }
+    }
+  if (p1==c) /* p1==c here means the last pass wrote to ch, so copy it into c */
+    memcpy (c,ch,n*sizeof(double));
+  }
+
+static void rfftb1(size_t n, double c[], double ch[], const double wa[],
+  const size_t ifac[]) /* backward real FFT driver: one radb* pass per factor, factors taken first to last */
+  {
+  size_t k1, l1=1, nf=ifac[1], iw=0; /* iw walks forwards through the twiddle array */
+  double *p1=c, *p2=ch;
+
+  for(k1=1; k1<=nf; k1++)
+    {
+    size_t ip = ifac[k1+1],
+           ido= n/(ip*l1);
+    if(ip==4)
+      radb4(ido, l1, p1, p2, wa+iw);
+    else if(ip==2)
+      radb2(ido, l1, p1, p2, wa+iw);
+    else if(ip==3)
+      radb3(ido, l1, p1, p2, wa+iw);
+    else if(ip==5)
+      radb5(ido, l1, p1, p2, wa+iw);
+    else
+      {
+      radbg(ido, ip, l1, ido*l1, p1, p2, wa+iw);
+      if (ido!=1) /* together with the swap below this leaves p1/p2 unchanged: for ido!=1 radbg stores its result back into its input buffer */
+        SWAP (p1,p2,double *);
+      }
+    SWAP (p1,p2,double *); /* after the swap p1 points at the data for the next pass */
+    l1*=ip;
+    iw+=(ip-1)*ido;
+    }
+  if (p1!=c) /* the final data sits in ch: copy it into c */
+    memcpy (c,ch,n*sizeof(double));
+  }
+
+void rfftf(size_t n, double r[], double wsave[])
+  { if(n>1) rfftf1(n, r, wsave, wsave+n,(size_t*)(wsave+2*n)); } /* n<2: nothing to transform; n==0 must not read factors from wsave */
+
+void rfftb(size_t n, double r[], double wsave[])
+  { if(n>1) rfftb1(n, r, wsave, wsave+n,(size_t*)(wsave+2*n)); } /* n<2: nothing to transform; n==0 must not read factors from wsave */
+
+static void rffti1(size_t n, double wa[], size_t ifac[]) /* fills ifac with the factors of n and wa with (cos,sin) twiddle pairs */
+  {
+  static const size_t ntryh[4]={4,2,3,5}; /* divisors tried first, in this order */
+  static const double twopi=6.28318530717958647692;
+  size_t i, j, k, fi;
+
+  double argh=twopi/n;
+  size_t is=0, l1=1; /* is: start of the current j-row in wa */
+  factorize (n,ntryh,4,ifac);
+  for (k=1; k<ifac[1]; k++) /* strict '<': no twiddles are generated for the last factor */
+    {
+    size_t ip=ifac[k+1],
+           ido=n/(l1*ip);
+    for (j=1; j<ip; ++j)
+      {
+      double argld=j*l1*argh;
+      for(i=is,fi=1; i<=ido+is-3; i+=2,++fi) /* NOTE(review): ido+is-3 is size_t arithmetic and wraps if ido+is<3 - confirm ido>=3 always holds here */
+        {
+        double arg=fi*argld;
+        wa[i  ]=cos(arg);
+        wa[i+1]=sin(arg);
+        }
+      is+=ido; /* each j-row occupies ido doubles */
+      }
+    l1*=ip;
+    }
+  }
+
+void rffti(size_t n, double wsave[])
+  { if (n>1) rffti1(n, wsave+n,(size_t*)(wsave+2*n)); } /* n<2: nothing to prepare; n==0 would never leave factorize() */
diff --git a/libfftpack/fftpack.h b/libfftpack/fftpack.h
new file mode 100644
index 0000000..6a2e96e
--- /dev/null
+++ b/libfftpack/fftpack.h
@@ -0,0 +1,64 @@
+/*
+ *  This file is part of libfftpack.
+ *
+ *  libfftpack is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libfftpack is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libfftpack; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*
+  fftpack.h : function declarations for fftpack.c
+  Algorithmically based on Fortran-77 FFTPACK by Paul N. Swarztrauber
+  (Version 4, 1985).
+
+  Pekka Janhunen 23.2.1995
+
+  (reformatted by joerg arndt)
+
+  reformatted and slightly enhanced by Martin Reinecke (2004)
+ */
+
+#ifndef PLANCK_FFTPACK_H
+#define PLANCK_FFTPACK_H
+
+#include "c_utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! forward complex transform of N complex values, result returned in \a complex_data; \a wrk as prepared by cffti(N,wrk) */
+void cfftf(size_t N, double complex_data[], double wrk[]);
+/*! backward complex transform of N complex values, result returned in \a complex_data; \a wrk as prepared by cffti(N,wrk) */
+void cfftb(size_t N, double complex_data[], double wrk[]);
+/*! initializer for complex transforms: stores twiddle factors and the factorization of N in \a wrk */
+void cffti(size_t N, double wrk[]);
+
+/*! forward real transform of N values, result returned in \a data; \a wrk as prepared by rffti(N,wrk) */
+void rfftf(size_t N, double data[], double wrk[]);
+/*! backward real transform of N values, result returned in \a data; \a wrk as prepared by rffti(N,wrk) */
+void rfftb(size_t N, double data[], double wrk[]);
+/*! initializer for real transforms: stores twiddle factors and the factorization of N in \a wrk */
+void rffti(size_t N, double wrk[]);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/libfftpack/fftpack_inc.c b/libfftpack/fftpack_inc.c
new file mode 100644
index 0000000..55d0ac5
--- /dev/null
+++ b/libfftpack/fftpack_inc.c
@@ -0,0 +1,306 @@
+/*
+ *  This file is part of libfftpack.
+ *
+ *  libfftpack is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libfftpack is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libfftpack; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*
+  fftpack_inc.c : complex FFT pass routines, parameterized by the X() name macro and the BACKWARD switch.
+  Algorithmically based on Fortran-77 FFTPACK by Paul N. Swarztrauber
+  (Version 4, 1985).
+
+  C port by Martin Reinecke (2010)
+ */
+
+#ifdef BACKWARD
+#define PSIGN +
+#define PMSIGNC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; }
+/* MULPMSIGNC, backward variant: a = b*c (complex product) */
+#define MULPMSIGNC(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; }
+#else
+#define PSIGN -
+#define PMSIGNC(a,b,c,d) { a.r=c.r-d.r; a.i=c.i-d.i; b.r=c.r+d.r; b.i=c.i+d.i; }
+/* MULPMSIGNC, forward variant: a = conj(b)*c; PMSIGNC likewise swaps which of a,b gets the sum */
+#define MULPMSIGNC(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; }
+#endif
+
+static void X(2) (size_t ido, size_t l1, const cmplx *cc, cmplx *ch,
+  const cmplx *wa) /* radix-2 complex pass: reads cc, writes ch; direction fixed at compile time via BACKWARD */
+  {
+  const size_t cdim=2; /* presumably consumed by the CC() index macro, which is defined outside this view */
+  size_t k,i;
+  cmplx t;
+  if (ido==1) /* single column: no WA() factor is applied */
+    for (k=0;k<l1;++k)
+      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
+  else
+    for (k=0;k<l1;++k)
+      for (i=0;i<ido;++i)
+        {
+        PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
+        MULPMSIGNC (CH(i,k,1),WA(0,i),t) /* output 1 = WA(0,i)*t (backward) or conj(WA(0,i))*t (forward) */
+        }
+  }
+
+static void X(3)(size_t ido, size_t l1, const cmplx *cc, cmplx *ch,
+  const cmplx *wa) /* radix-3 complex pass: reads cc, writes ch; direction fixed at compile time via BACKWARD */
+  {
+  const size_t cdim=3; /* presumably consumed by the CC() index macro, which is defined outside this view */
+  static const double taur=-0.5, taui= PSIGN 0.86602540378443864676; /* cos(2*pi/3) and +/-sin(2*pi/3); sign comes from PSIGN */
+  size_t i, k;
+  cmplx c2, c3, d2, d3, t2;
+
+  if (ido==1) /* single column: results go straight to ch, no WA() factor */
+    for (k=0; k<l1; ++k)
+      {
+      PMC (t2,c3,CC(0,1,k),CC(0,2,k))
+      ADDC (CH(0,k,0),t2,CC(0,0,k))
+      SCALEC(t2,taur)
+      ADDC(c2,CC(0,0,k),t2)
+      SCALEC(c3,taui)
+      CONJFLIPC(c3)
+      PMC(CH(0,k,1),CH(0,k,2),c2,c3)
+      }
+  else
+    for (k=0; k<l1; ++k)
+      for (i=0; i<ido; ++i) /* same arithmetic as above, then outputs 1 and 2 are combined with WA() */
+        {
+        PMC (t2,c3,CC(i,1,k),CC(i,2,k))
+        ADDC (CH(i,k,0),t2,CC(i,0,k))
+        SCALEC(t2,taur)
+        ADDC(c2,CC(i,0,k),t2)
+        SCALEC(c3,taui)
+        CONJFLIPC(c3)
+        PMC(d2,d3,c2,c3)
+        MULPMSIGNC(CH(i,k,1),WA(0,i),d2)
+        MULPMSIGNC(CH(i,k,2),WA(1,i),d3)
+        }
+  }
+
+static void X(4)(size_t ido, size_t l1, const cmplx *cc, cmplx *ch,
+  const cmplx *wa) /* radix-4 complex pass: reads cc, writes ch; direction fixed at compile time via BACKWARD */
+  {
+  const size_t cdim=4; /* presumably consumed by the CC() index macro, which is defined outside this view */
+  size_t i, k;
+  cmplx c2, c3, c4, t1, t2, t3, t4;
+
+  if (ido==1) /* single column: results go straight to ch, no WA() factor */
+    for (k=0; k<l1; ++k)
+      {
+      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
+      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
+      CONJFLIPC(t4)
+      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
+      PMSIGNC (CH(0,k,1),CH(0,k,3),t1,t4) /* the only direction-dependent step in this branch */
+      }
+  else
+    for (k=0; k<l1; ++k)
+      for (i=0; i<ido; ++i) /* same arithmetic as above, then outputs 1..3 are combined with WA() */
+        {
+        PMC(t2,t1,CC(i,0,k),CC(i,2,k))
+        PMC(t3,t4,CC(i,1,k),CC(i,3,k))
+        CONJFLIPC(t4)
+        PMC(CH(i,k,0),c3,t2,t3)
+        PMSIGNC (c2,c4,t1,t4)
+        MULPMSIGNC (CH(i,k,1),WA(0,i),c2)
+        MULPMSIGNC (CH(i,k,2),WA(1,i),c3)
+        MULPMSIGNC (CH(i,k,3),WA(2,i),c4)
+        }
+  }
+
+static void X(5)(size_t ido, size_t l1, const cmplx *cc, cmplx *ch,
+  const cmplx *wa) /* radix-5 complex pass: reads cc, writes ch; direction fixed at compile time via BACKWARD */
+  {
+  const size_t cdim=5; /* presumably consumed by the CC() index macro, which is defined outside this view */
+  static const double tr11= 0.3090169943749474241, /* cos(2*pi/5) */
+                      ti11= PSIGN 0.95105651629515357212, /* +/-sin(2*pi/5), sign from PSIGN */
+                      tr12=-0.8090169943749474241, /* cos(4*pi/5) */
+                      ti12= PSIGN 0.58778525229247312917; /* +/-sin(4*pi/5), sign from PSIGN */
+  size_t i, k;
+  cmplx c2, c3, c4, c5, d2, d3, d4, d5, t2, t3, t4, t5;
+
+  if (ido==1) /* single column: results go straight to ch, no WA() factor */
+    for (k=0; k<l1; ++k)
+      {
+      PMC (t2,t5,CC(0,1,k),CC(0,4,k))
+      PMC (t3,t4,CC(0,2,k),CC(0,3,k))
+      CH(0,k,0).r=CC(0,0,k).r+t2.r+t3.r;
+      CH(0,k,0).i=CC(0,0,k).i+t2.i+t3.i;
+      c2.r=CC(0,0,k).r+tr11*t2.r+tr12*t3.r;
+      c2.i=CC(0,0,k).i+tr11*t2.i+tr12*t3.i;
+      c3.r=CC(0,0,k).r+tr12*t2.r+tr11*t3.r;
+      c3.i=CC(0,0,k).i+tr12*t2.i+tr11*t3.i;
+      c5.r=ti11*t5.r+ti12*t4.r;
+      c5.i=ti11*t5.i+ti12*t4.i;
+      c4.r=ti12*t5.r-ti11*t4.r;
+      c4.i=ti12*t5.i-ti11*t4.i;
+      CONJFLIPC(c5)
+      PMC(CH(0,k,1),CH(0,k,4),c2,c5)
+      CONJFLIPC(c4)
+      PMC(CH(0,k,2),CH(0,k,3),c3,c4)
+      }
+  else
+    for (k=0; k<l1; ++k)
+      for (i=0; i<ido; ++i) /* same arithmetic as above, then outputs 1..4 are combined with WA() */
+        {
+        PMC (t2,t5,CC(i,1,k),CC(i,4,k))
+        PMC (t3,t4,CC(i,2,k),CC(i,3,k))
+        CH(i,k,0).r=CC(i,0,k).r+t2.r+t3.r;
+        CH(i,k,0).i=CC(i,0,k).i+t2.i+t3.i;
+        c2.r=CC(i,0,k).r+tr11*t2.r+tr12*t3.r;
+        c2.i=CC(i,0,k).i+tr11*t2.i+tr12*t3.i;
+        c3.r=CC(i,0,k).r+tr12*t2.r+tr11*t3.r;
+        c3.i=CC(i,0,k).i+tr12*t2.i+tr11*t3.i;
+        c5.r=ti11*t5.r+ti12*t4.r;
+        c5.i=ti11*t5.i+ti12*t4.i;
+        c4.r=ti12*t5.r-ti11*t4.r;
+        c4.i=ti12*t5.i-ti11*t4.i;
+        CONJFLIPC(c5)
+        PMC(d2,d5,c2,c5)
+        CONJFLIPC(c4)
+        PMC(d3,d4,c3,c4)
+        MULPMSIGNC (CH(i,k,1),WA(0,i),d2)
+        MULPMSIGNC (CH(i,k,2),WA(1,i),d3)
+        MULPMSIGNC (CH(i,k,3),WA(2,i),d4)
+        MULPMSIGNC (CH(i,k,4),WA(3,i),d5)
+        }
+  }
+
+static void X(6)(size_t ido, size_t l1, const cmplx *cc, cmplx *ch,
+  const cmplx *wa) /* radix-6 complex pass: reads cc, writes ch; direction fixed at compile time via BACKWARD */
+  {
+  const size_t cdim=6; /* presumably consumed by the CC() index macro, which is defined outside this view */
+  static const double taui= PSIGN 0.86602540378443864676; /* +/-sin(2*pi/3), sign from PSIGN */
+  cmplx ta1,ta2,ta3,a0,a1,a2,tb1,tb2,tb3,b0,b1,b2,d1,d2,d3,d4,d5;
+  size_t i, k;
+
+  if (ido==1) /* single column: results go straight to ch, no WA() factor */
+    for (k=0; k<l1; ++k)
+      {
+      PMC(ta1,ta3,CC(0,2,k),CC(0,4,k)) /* a0..a2: three-point combination of inputs 0,2,4 */
+      ta2.r = CC(0,0,k).r - .5*ta1.r;
+      ta2.i = CC(0,0,k).i - .5*ta1.i;
+      SCALEC(ta3,taui)
+      ADDC(a0,CC(0,0,k),ta1)
+      CONJFLIPC(ta3)
+      PMC(a1,a2,ta2,ta3)
+      PMC(tb1,tb3,CC(0,5,k),CC(0,1,k)) /* b0..b2: three-point combination of inputs 3,5,1 */
+      tb2.r = CC(0,3,k).r - .5*tb1.r;
+      tb2.i = CC(0,3,k).i - .5*tb1.i;
+      SCALEC(tb3,taui)
+      ADDC(b0,CC(0,3,k),tb1)
+      CONJFLIPC(tb3)
+      PMC(b1,b2,tb2,tb3)
+      PMC(CH(0,k,0),CH(0,k,3),a0,b0) /* final step pairs each a with the matching b */
+      PMC(CH(0,k,4),CH(0,k,1),a1,b1)
+      PMC(CH(0,k,2),CH(0,k,5),a2,b2)
+      }
+  else
+    for (k=0; k<l1; ++k)
+      for (i=0; i<ido; ++i) /* same arithmetic as above, then outputs 1..5 are combined with WA() */
+        {
+        PMC(ta1,ta3,CC(i,2,k),CC(i,4,k))
+        ta2.r = CC(i,0,k).r - .5*ta1.r;
+        ta2.i = CC(i,0,k).i - .5*ta1.i;
+        SCALEC(ta3,taui)
+        ADDC(a0,CC(i,0,k),ta1)
+        CONJFLIPC(ta3)
+        PMC(a1,a2,ta2,ta3)
+        PMC(tb1,tb3,CC(i,5,k),CC(i,1,k))
+        tb2.r = CC(i,3,k).r - .5*tb1.r;
+        tb2.i = CC(i,3,k).i - .5*tb1.i;
+        SCALEC(tb3,taui)
+        ADDC(b0,CC(i,3,k),tb1)
+        CONJFLIPC(tb3)
+        PMC(b1,b2,tb2,tb3)
+        PMC(CH(i,k,0),d3,a0,b0)
+        PMC(d4,d1,a1,b1)
+        PMC(d2,d5,a2,b2)
+        MULPMSIGNC (CH(i,k,1),WA(0,i),d1)
+        MULPMSIGNC (CH(i,k,2),WA(1,i),d2)
+        MULPMSIGNC (CH(i,k,3),WA(2,i),d3)
+        MULPMSIGNC (CH(i,k,4),WA(3,i),d4)
+        MULPMSIGNC (CH(i,k,5),WA(4,i),d5)
+        }
+  }
+
+static void X(g)(size_t ido, size_t ip, size_t l1, const cmplx *cc, cmplx *ch,
+  const cmplx *wa) /* generic radix-ip complex pass: reads cc, writes ch; direction fixed at compile time via BACKWARD */
+  {
+  const size_t cdim=ip; /* presumably consumed by the CC() index macro, which is defined outside this view */
+  cmplx *tarr=RALLOC(cmplx,2*ip); /* one allocation, split into ccl (first ip entries) and wal (last ip entries) */
+  cmplx *ccl=tarr, *wal=tarr+ip;
+  size_t i,j,k,l,jc,lc;
+  size_t ipph = (ip+1)/2;
+
+  for (i=1; i<ip; ++i) /* wal[1..ip-1]: every ido-th entry of wa; wal[0] is never written */
+    wal[i]=wa[ido*(i-1)];
+  for (k=0; k<l1; ++k)
+    for (i=0; i<ido; ++i)
+      {
+      cmplx s=CC(i,0,k); /* s accumulates output 0: input 0 plus all ccl[j] */
+      ccl[0] = CC(i,0,k);
+      for(j=1,jc=ip-1; j<ipph; ++j,--jc)
+        {
+        PMC (ccl[j],ccl[jc],CC(i,j,k),CC(i,jc,k))
+        ADDC (s,s,ccl[j])
+        }
+      CH(i,k,0) = s;
+      for (j=1, jc=ip-1; j<=ipph; ++j,--jc) /* NOTE(review): for odd ip the j==ipph iteration rewrites the outputs already produced at j==ipph-1 - confirm intended */
+        {
+        cmplx abr=ccl[0], abi={0.,0.};
+        size_t iang=0; /* index of angle j*l in wal, reduced by ip when it exceeds ip */
+        for (l=1,lc=ip-1; l<ipph; ++l,--lc)
+          {
+          iang+=j;
+          if (iang>ip) iang-=ip; /* NOTE(review): iang==ip would index wal[ip], one past the table - confirm it cannot occur for the radices routed here */
+          abr.r += ccl[l ].r*wal[iang].r;
+          abr.i += ccl[l ].i*wal[iang].r;
+          abi.r += ccl[lc].r*wal[iang].i;
+          abi.i += ccl[lc].i*wal[iang].i;
+          }
+#ifndef BACKWARD
+          { abi.i=-abi.i; abi.r=-abi.r; } /* forward variant only: negate abi */
+#endif
+        CONJFLIPC(abi)
+        PMC(CH(i,k,j),CH(i,k,jc),abr,abi)
+        }
+      }
+
+  DEALLOC(tarr);
+
+  if (ido==1) return; /* no wa factors to apply for a single column */
+
+  for (j=1; j<ip; ++j) /* outputs 1..ip-1, columns i>=1: combine in place with wa */
+    for (k=0; k<l1; ++k)
+      {
+      size_t idij=(j-1)*ido+1;
+      for(i=1; i<ido; ++i, ++idij)
+        {
+        cmplx t=CH(i,k,j); /* copy first: MULPMSIGNC writes its destination while still reading the source */
+        MULPMSIGNC (CH(i,k,j),wa[idij],t)
+        }
+      }
+  }
+
+#undef PSIGN
+#undef PMSIGNC
+#undef MULPMSIGNC
diff --git a/libfftpack/libfftpack.dox b/libfftpack/libfftpack.dox
new file mode 100644
index 0000000..9ed2362
--- /dev/null
+++ b/libfftpack/libfftpack.dox
@@ -0,0 +1,5 @@
+/*! \mainpage Libfftpack documentation
+  <ul>
+  <li>\ref fftgroup "Programming interface"
+  </ul>
+ */
diff --git a/libfftpack/ls_fft.c b/libfftpack/ls_fft.c
new file mode 100644
index 0000000..b1c0c96
--- /dev/null
+++ b/libfftpack/ls_fft.c
@@ -0,0 +1,291 @@
+/*
+ *  This file is part of libfftpack.
+ *
+ *  libfftpack is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libfftpack is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libfftpack; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*
+ *  Copyright (C) 2005 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include "bluestein.h"
+#include "fftpack.h"
+#include "ls_fft.h"
+
+complex_plan make_complex_plan (size_t length)
+  { /* Build a plan for complex FFTs of the given length; release it with kill_complex_plan(). */
+  complex_plan plan = RALLOC(complex_plan_i,1);
+  size_t pfsum = prime_factor_sum(length);
+  double comp1 = (double)length*(double)pfsum; /* cost estimate for FFTPACK; multiplied in double so it cannot wrap in size_t */
+  double comp2 = 2*3*length*log(3.*length); /* cost estimate for the Bluestein algorithm */
+  comp2*=3.; /* fudge factor that appears to give good overall performance */
+  plan->length=length;
+  plan->bluestein = (comp2<comp1); /* take Bluestein only when its estimate is the smaller one */
+  if (plan->bluestein)
+    bluestein_i (length,&(plan->work),&(plan->worksize));
+  else
+    {
+    plan->worksize=4*length+15;
+    plan->work=RALLOC(double,4*length+15);
+    cffti(length, plan->work);
+    }
+  return plan;
+  }
+
+complex_plan copy_complex_plan (complex_plan plan)
+  {
+  /* Deep copy: the duplicate gets its own work array; a NULL plan yields NULL. */
+  complex_plan twin;
+  if (plan==NULL) return NULL;
+  twin = RALLOC(complex_plan_i,1);
+  *twin = *plan;
+  twin->work = RALLOC(double,plan->worksize);
+  memcpy(twin->work,plan->work,plan->worksize*sizeof(double));
+  return twin;
+  }
+
+void kill_complex_plan (complex_plan plan)
+  { /* Release a plan together with its work array. */
+  if (!plan) return; /* NULL is ignored, matching copy_complex_plan(), which also accepts NULL */
+  DEALLOC(plan->work); DEALLOC(plan);
+  }
+
+void complex_plan_forward (complex_plan plan, double *data)
+  {
+  /* Forward complex FFT of data with whichever algorithm the plan selected. */
+  if (!plan->bluestein)
+    { cfftf (plan->length, data, plan->work); return; }
+  bluestein (plan->length, data, plan->work, -1);
+  }
+
+void complex_plan_backward (complex_plan plan, double *data)
+  {
+  /* Backward complex FFT of data with whichever algorithm the plan selected. */
+  if (!plan->bluestein)
+    { cfftb (plan->length, data, plan->work); return; }
+  bluestein (plan->length, data, plan->work, 1);
+  }
+
+
+real_plan make_real_plan (size_t length)
+  { /* Build a plan for real FFTs of the given length; release it with kill_real_plan(). */
+  real_plan plan = RALLOC(real_plan_i,1);
+  size_t pfsum = prime_factor_sum(length);
+  double comp1 = .5*length*pfsum; /* cost estimate for FFTPACK, half of the complex-plan estimate */
+  double comp2 = 2*3*length*log(3.*length); /* cost estimate for the Bluestein algorithm */
+  comp2*=3; /* fudge factor that appears to give good overall performance */
+  plan->length=length;
+  plan->bluestein = (comp2<comp1); /* take Bluestein only when its estimate is the smaller one */
+  if (plan->bluestein)
+    bluestein_i (length,&(plan->work),&(plan->worksize));
+  else
+    {
+    plan->worksize=2*length+15;
+    plan->work=RALLOC(double,2*length+15);
+    rffti(length, plan->work);
+    }
+  return plan;
+  }
+
+real_plan copy_real_plan (real_plan plan)
+  {
+  /* Deep copy: the duplicate gets its own work array; a NULL plan yields NULL. */
+  real_plan twin;
+  if (plan==NULL) return NULL;
+  twin = RALLOC(real_plan_i,1);
+  *twin = *plan;
+  twin->work = RALLOC(double,plan->worksize);
+  memcpy(twin->work,plan->work,plan->worksize*sizeof(double));
+  return twin;
+  }
+
+void kill_real_plan (real_plan plan)
+  { /* Release a plan together with its work array. */
+  if (!plan) return; /* NULL is ignored, matching copy_real_plan(), which also accepts NULL */
+  DEALLOC(plan->work); DEALLOC(plan);
+  }
+
+void real_plan_forward_fftpack (real_plan plan, double *data)
+  { /* Forward real FFT of data[0..length-1], in place; both branches leave the result in the same array. */
+  if (plan->bluestein)
+    {
+    size_t m;
+    size_t n=plan->length;
+    double *tmp = RALLOC(double,2*n); /* n complex values, freed below */
+    for (m=0; m<n; ++m) /* widen the real input to complex with zero imaginary part */
+      {
+      tmp[2*m] = data[m];
+      tmp[2*m+1] = 0.;
+      }
+    bluestein(n,tmp,plan->work,-1);
+    data[0] = tmp[0]; /* keep tmp[0], skip tmp[1], then copy the following n-1 doubles */
+    memcpy (data+1, tmp+2, (n-1)*sizeof(double));
+    DEALLOC(tmp);
+    }
+  else
+    rfftf (plan->length, data, plan->work);
+  }
+
+static void fftpack2halfcomplex (double *data, size_t n)
+  { /* In-place reordering of data[0..n-1], carried out through a temporary copy. */
+  size_t idx, half=(n+1)/2;
+  double *scratch = RALLOC(double,n);
+  scratch[0]=data[0];
+  for (idx=1; idx<half; ++idx)
+    {
+    scratch[idx]  =data[2*idx-1];  /* odd slots fill positions 1,2,... upwards */
+    scratch[n-idx]=data[2*idx];    /* even slots fill positions n-1,n-2,... downwards */
+    }
+  if ((n&1)==0) /* even n: the last input element lands in the middle */
+    scratch[n/2]=data[n-1];
+  memcpy (data,scratch,n*sizeof(double));
+  DEALLOC(scratch);
+  }
+
+static void halfcomplex2fftpack (double *data, size_t n)
+  { /* In-place reordering of data[0..n-1]; exact inverse of the index mapping in fftpack2halfcomplex(). */
+  size_t idx, half=(n+1)/2;
+  double *scratch = RALLOC(double,n);
+  scratch[0]=data[0];
+  for (idx=1; idx<half; ++idx)
+    {
+    scratch[2*idx-1]=data[idx];    /* positions 1,2,... go to the odd slots */
+    scratch[2*idx]  =data[n-idx];  /* positions n-1,n-2,... go to the even slots */
+    }
+  if ((n&1)==0) /* even n: the middle element becomes the last one */
+    scratch[n-1]=data[n/2];
+  memcpy (data,scratch,n*sizeof(double));
+  DEALLOC(scratch);
+  }
+
+void real_plan_forward_fftw (real_plan plan, double *data)
+  { /* Forward real FFT in place, then reorder the result with fftpack2halfcomplex(). */
+  real_plan_forward_fftpack (plan, data);
+  fftpack2halfcomplex (data,plan->length);
+  }
+
+void real_plan_backward_fftpack (real_plan plan, double *data)
+  { /* Backward real FFT of data[0..length-1], in place; input uses the layout real_plan_forward_fftpack() produces. */
+  if (plan->bluestein)
+    {
+    size_t m;
+    size_t n=plan->length;
+    double *tmp = RALLOC(double,2*n); /* n complex values, freed below */
+    tmp[0]=data[0]; /* first value gets a zero imaginary part */
+    tmp[1]=0.;
+    memcpy (tmp+2,data+1, (n-1)*sizeof(double));
+    if ((n&1)==0) tmp[n+1]=0.; /* even n: the middle value also gets a zero imaginary part */
+    for (m=2; m<n; m+=2) /* fill the upper half with the complex conjugates of the lower half */
+      {
+      tmp[2*n-m]=tmp[m];
+      tmp[2*n-m+1]=-tmp[m+1];
+      }
+    bluestein (n, tmp, plan->work, 1);
+    for (m=0; m<n; ++m) /* keep only the real parts */
+      data[m] = tmp[2*m];
+    DEALLOC(tmp);
+    }
+  else
+    rfftb (plan->length, data, plan->work);
+  }
+
+void real_plan_backward_fftw (real_plan plan, double *data)
+  { /* Reorder the input with halfcomplex2fftpack(), then run the backward real FFT in place. */
+  halfcomplex2fftpack (data,plan->length);
+  real_plan_backward_fftpack (plan, data);
+  }
+
+void real_plan_forward_c (real_plan plan, double *data)
+  { /* Forward FFT of real input stored in the even slots of data[0..2n-1]; writes a full complex result. */
+  size_t m;
+  size_t n=plan->length;
+
+  if (plan->bluestein)
+    {
+    for (m=1; m<2*n; m+=2) /* clear every odd (imaginary) slot before transforming */
+      data[m]=0;
+    bluestein (plan->length, data, plan->work, -1);
+    data[1]=0;
+    for (m=2; m<n; m+=2) /* make entries m and 2n-m exact complex conjugates by averaging the pair */
+      {
+      double avg;
+      avg = 0.5*(data[2*n-m]+data[m]);
+      data[2*n-m] = data[m] = avg;
+      avg = 0.5*(data[2*n-m+1]-data[m+1]);
+      data[2*n-m+1] = avg;
+      data[m+1] = -avg;
+      }
+    if ((n&1)==0) data[n+1] = 0.; /* even n: imaginary part of the middle entry is zeroed */
+    }
+  else
+    {
+/* using "m+m" instead of "2*m" to avoid a nasty bug in Intel's compiler */
+    for (m=0; m<n; ++m) data[m+1] = data[m+m]; /* pack the even slots into data[1..n] */
+    rfftf (n, data+1, plan->work);
+    data[0] = data[1]; /* first output value moves to slot 0, its imaginary part is zero */
+    data[1] = 0;
+    for (m=2; m<n; m+=2) /* upper half: complex conjugates of the lower half */
+      {
+      data[2*n-m]   =  data[m];
+      data[2*n-m+1] = -data[m+1];
+      }
+    if ((n&1)==0) data[n+1] = 0.; /* even n: imaginary part of the middle entry is zeroed */
+    }
+  }
+
+void real_plan_backward_c (real_plan plan, double *data)
+  { /* Backward FFT on the complex array data[0..2n-1]; the result has zeros in every odd slot. */
+  size_t n=plan->length;
+
+  if (plan->bluestein)
+    {
+    size_t m;
+    data[1]=0;
+    for (m=2; m<n; m+=2) /* make entries m and 2n-m exact complex conjugates by averaging the pair */
+      {
+      double avg;
+      avg = 0.5*(data[2*n-m]+data[m]);
+      data[2*n-m] = data[m] = avg;
+      avg = 0.5*(data[2*n-m+1]-data[m+1]);
+      data[2*n-m+1] = avg;
+      data[m+1] = -avg;
+      }
+    if ((n&1)==0) data[n+1] = 0.; /* even n: imaginary part of the middle entry is zeroed */
+    bluestein (plan->length, data, plan->work, 1);
+    for (m=1; m<2*n; m+=2) /* discard the imaginary parts of the output */
+      data[m]=0;
+    }
+  else
+    {
+    ptrdiff_t m; /* signed, because the loop below counts down to -1 */
+    data[1] = data[0]; /* rfftb works on data+1, so slot 0 is moved up; the old data[1] is dropped */
+    rfftb (n, data+1, plan->work);
+    for (m=n-1; m>=0; --m) /* spread data[1..n] to the even slots, highest index first so nothing unread is overwritten */
+      {
+      data[2*m]   = data[m+1];
+      data[2*m+1] = 0.;
+      }
+    }
+  }
diff --git a/libfftpack/ls_fft.h b/libfftpack/ls_fft.h
new file mode 100644
index 0000000..8675454
--- /dev/null
+++ b/libfftpack/ls_fft.h
@@ -0,0 +1,162 @@
+/*
+ *  This file is part of libfftpack.
+ *
+ *  libfftpack is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libfftpack is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libfftpack; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file ls_fft.h
+ *  Interface for the LevelS FFT package.
+ *
+ *  Copyright (C) 2004 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#ifndef PLANCK_LS_FFT_H
+#define PLANCK_LS_FFT_H
+
+#include "c_utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\defgroup fftgroup FFT interface
+This package is intended to calculate one-dimensional real or complex FFTs
+with high accuracy and good efficiency even for lengths containing large
+prime factors.
+The code is written in C, but a Fortran wrapper exists as well.
+
+Before any FFT is executed, a plan must be generated for it. Plan creation
+is designed to be fast, so that there is no significant overhead if the
+plan is only used once or a few times.
+
+The main component of the code is based on Paul N. Swarztrauber's FFTPACK in the
+double precision incarnation by Hugh C. Pumphrey
+(http://www.netlib.org/fftpack/dp.tgz).
+
+I replaced the iterative sine and cosine calculations in radfg() and radbg()
+by an exact calculation, which slightly improves the transform accuracy for
+real FFTs with lengths containing large prime factors.
+
+Since FFTPACK becomes quite slow for FFT lengths with large prime factors
+(in the worst case of prime lengths it reaches \f$\mathcal{O}(n^2)\f$
+complexity), I implemented Bluestein's algorithm, which computes an FFT of length
+\f$n\f$ by several FFTs of length \f$n_2\ge 2n-1\f$ and a convolution. Since
+\f$n_2\f$ can be chosen to be highly composite, this algorithm is more efficient
+if \f$n\f$ has large prime factors. The longer FFTs themselves are then computed
+using the FFTPACK routines.
+Bluestein's algorithm was implemented according to the description on Wikipedia
+(<a href="http://en.wikipedia.org/wiki/Bluestein%27s_FFT_algorithm">
+http://en.wikipedia.org/wiki/Bluestein%27s_FFT_algorithm</a>).
+
+\b Thread-safety:
+All routines can be called concurrently; all information needed by
+<tt>ls_fft</tt> is stored in the plan variable. However, using the same plan
+variable on multiple threads simultaneously is not supported and will lead to
+data corruption.
+*/
+/*! \{ */
+
+typedef struct
+  {
+  double *work;            /* presumably the work array handed to the FFT kernels, as in real_plan_i */
+  size_t length, worksize; /* transform length; worksize presumably describes the size of work */
+  int bluestein;           /* presumably nonzero when Bluestein's algorithm is used, as in real_plan_i */
+  } complex_plan_i;
+
+/*! The opaque handle type for complex-FFT plans. */
+typedef complex_plan_i * complex_plan;
+
+/*! Returns a plan for a complex FFT with \a length elements. */
+complex_plan make_complex_plan (size_t length);
+/*! Constructs a copy of \a plan. */
+complex_plan copy_complex_plan (complex_plan plan);
+/*! Destroys a plan for a complex FFT. */
+void kill_complex_plan (complex_plan plan);
+/*! Computes a complex forward FFT on \a data, using \a plan.
+    \a Data has the form <tt>r0, i0, r1, i1, ...,
+    r[length-1], i[length-1]</tt>. */
+void complex_plan_forward (complex_plan plan, double *data);
+/*! Computes a complex backward FFT on \a data, using \a plan.
+    \a Data has the form <tt>r0, i0, r1, i1, ...,
+    r[length-1], i[length-1]</tt>. */
+void complex_plan_backward (complex_plan plan, double *data);
+
+typedef struct
+  {
+  double *work;            /* work array passed to rfftf/rfftb/bluestein */
+  size_t length, worksize; /* transform length; worksize presumably describes the size of work */
+  int bluestein;           /* nonzero: transforms go through bluestein() instead of rfftf/rfftb */
+  } real_plan_i;
+
+/*! The opaque handle type for real-FFT plans. */
+typedef real_plan_i * real_plan;
+
+/*! Returns a plan for a real FFT with \a length elements. */
+real_plan make_real_plan (size_t length);
+/*! Constructs a copy of \a plan. */
+real_plan copy_real_plan (real_plan plan);
+/*! Destroys a plan for a real FFT. */
+void kill_real_plan (real_plan plan);
+/*! Computes a real forward FFT on \a data, using \a plan
+    and assuming the FFTPACK storage scheme:
+    - on entry, \a data has the form <tt>r0, r1, ..., r[length-1]</tt>;
+    - on exit, it has the form <tt>r0, r1, i1, r2, i2, ...</tt>
+      (a total of \a length values). */
+void real_plan_forward_fftpack (real_plan plan, double *data);
+/*! Computes a real backward FFT on \a data, using \a plan
+    and assuming the FFTPACK storage scheme:
+    - on entry, \a data has the form <tt>r0, r1, i1, r2, i2, ...</tt>
+    (a total of \a length values);
+    - on exit, it has the form <tt>r0, r1, ..., r[length-1]</tt>. */
+void real_plan_backward_fftpack (real_plan plan, double *data);
+/*! Computes a real forward FFT on \a data, using \a plan
+    and assuming the FFTW halfcomplex storage scheme:
+    - on entry, \a data has the form <tt>r0, r1, ..., r[length-1]</tt>;
+    - on exit, it has the form <tt>r0, r1, r2, ..., i2, i1</tt>. */
+void real_plan_forward_fftw (real_plan plan, double *data);
+/*! Computes a real backward FFT on \a data, using \a plan
+    and assuming the FFTW halfcomplex storage scheme:
+    - on entry, \a data has the form <tt>r0, r1, r2, ..., i2, i1</tt>.
+    - on exit, it has the form <tt>r0, r1, ..., r[length-1]</tt>. */
+void real_plan_backward_fftw (real_plan plan, double *data);
+/*! Computes a real forward FFT on \a data, using \a plan
+    and assuming a full-complex storage scheme:
+    - on entry, \a data has the form <tt>r0, [ignored], r1, [ignored], ...,
+      r[length-1], [ignored]</tt>;
+    - on exit, it has the form <tt>r0, i0, r1, i1, ...,
+      r[length-1], i[length-1]</tt>.
+    */
+void real_plan_forward_c (real_plan plan, double *data);
+/*! Computes a real backward FFT on \a data, using \a plan
+    and assuming a full-complex storage scheme:
+    - on entry, \a data has the form <tt>r0, i0, r1, i1, ...,
+      r[length-1], i[length-1]</tt>;
+    - on exit, it has the form <tt>r0, 0, r1, 0, ..., r[length-1], 0</tt>. */
+void real_plan_backward_c (real_plan plan, double *data);
+
+/*! \} */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/libfftpack/planck.make b/libfftpack/planck.make
new file mode 100644
index 0000000..c171367
--- /dev/null
+++ b/libfftpack/planck.make
@@ -0,0 +1,21 @@
+PKG:=libfftpack
+
+SD:=$(SRCROOT)/$(PKG)
+OD:=$(BLDROOT)/$(PKG)
+
+FULL_INCLUDE+= -I$(SD)
+
+HDR_$(PKG):=$(SD)/*.h
+LIB_$(PKG):=$(LIBDIR)/libfftpack.a
+OBJ:=fftpack.o bluestein.o ls_fft.o
+OBJ:=$(OBJ:%=$(OD)/%)
+
+ODEP:=$(HDR_$(PKG)) $(HDR_c_utils)
+
+$(OD)/fftpack.o: $(SD)/fftpack_inc.c
+
+$(OBJ): $(ODEP) | $(OD)_mkdir
+$(LIB_$(PKG)): $(OBJ)
+
+all_hdr+=$(HDR_$(PKG))
+all_lib+=$(LIB_$(PKG))
diff --git a/libsharp/complex_hacks.h b/libsharp/complex_hacks.h
new file mode 100644
index 0000000..99a7c2b
--- /dev/null
+++ b/libsharp/complex_hacks.h
@@ -0,0 +1,131 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*  \file complex_hacks.h
+ *  support for converting vector types and complex numbers
+ *
+ *  Copyright (C) 2012 Max-Planck-Society
+ *  Author: Martin Reinecke
+ */
+
+#ifndef COMPLEX_HACKS_H
+#define COMPLEX_HACKS_H
+
+#include <math.h>
+#include <complex.h>
+#include "vecsupport.h"
+
+#define UNSAFE_CODE
+
+#if (VLEN==1)
+
+static inline complex double vhsum_cmplx(Tv a, Tv b)
+  { return a+_Complex_I*b; } /* single-lane case: a is the real part, b the imaginary part */
+
+static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
+  complex double * restrict c1, complex double * restrict c2)
+  { *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; } /* accumulates a+i*b into *c1 and c+i*d into *c2 */
+
+#endif
+
+#if (VLEN==2)
+
+static inline complex double vhsum_cmplx (Tv a, Tv b)
+  { /* returns (a0+a1) + i*(b0+b1), xk being lane k of vector x */
+#if defined(__SSE3__)
+  Tv tmp = _mm_hadd_pd(a,b); /* {a0+a1, b0+b1} */
+#else
+  Tv tmp = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)), /* {a1, b0} */
+                _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0))); /* {a0, b1} */
+#endif
+  union {Tv v; complex double c; } u; /* reinterprets the two lanes as (re, im) */
+  u.v=tmp; return u.c;
+  }
+
+static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c,
+  Tv d, complex double * restrict c1, complex double * restrict c2)
+  {
+/* Adds (a0+a1) + i*(b0+b1) to *c1 and (c0+c1) + i*(d0+d1) to *c2, where
+   x0,x1 denote the two lanes of vector x. */
+#if defined(__SSE3__)
+  Tv s1 = _mm_hadd_pd(a,b);
+  Tv s2 = _mm_hadd_pd(c,d);
+#else
+  Tv s1 = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
+               _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
+  Tv s2 = vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
+               _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)));
+#endif
+#ifdef UNSAFE_CODE
+/* A complex double is only guaranteed the alignment of a double, so the
+   accumulators are accessed with unaligned loads/stores rather than by
+   dereferencing a __m128d pointer, which requires 16-byte alignment. */
+  _mm_storeu_pd((double *)c1, _mm_add_pd(_mm_loadu_pd((double *)c1), s1));
+  _mm_storeu_pd((double *)c2, _mm_add_pd(_mm_loadu_pd((double *)c2), s2));
+#else
+  union {Tv v; complex double c; } u1, u2;
+  u1.v = s1; u2.v = s2;
+  *c1+=u1.c; *c2+=u2.c;
+#endif
+  }
+
+#endif
+
+#if (VLEN==4)
+
+static inline complex double vhsum_cmplx (Tv a, Tv b)
+  {
+/* Returns (a0+a1+a2+a3) + i*(b0+b1+b2+b3), xk being lane k of vector x. */
+  Tv tmp=_mm256_hadd_pd(a,b);
+  tmp=_mm256_add_pd(tmp,_mm256_permute2f128_pd(tmp,tmp,1));
+#ifdef UNSAFE_CODE
+  complex double ret; /* only double-aligned, hence the unaligned store */
+  _mm_storeu_pd((double *)&ret,_mm256_extractf128_pd(tmp, 0));
+  return ret;
+#else
+  union {Tv v; complex double c[2]; } u;
+  u.v=tmp; return u.c[0];
+#endif
+  }
+
+static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
+  complex double * restrict c1, complex double * restrict c2)
+  {
+/* Adds sum(a)+i*sum(b) to *c1 and sum(c)+i*sum(d) to *c2 (sums over lanes). */
+  Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
+  tmp1=vadd(_mm256_permute2f128_pd(tmp1,tmp2,49),
+            _mm256_permute2f128_pd(tmp1,tmp2,32));
+#ifdef UNSAFE_CODE
+  double *p1=(double *)c1, *p2=(double *)c2; /* only double-aligned: use unaligned access */
+  _mm_storeu_pd(p1,_mm_add_pd(_mm_loadu_pd(p1),_mm256_extractf128_pd(tmp1, 0)));
+  _mm_storeu_pd(p2,_mm_add_pd(_mm_loadu_pd(p2),_mm256_extractf128_pd(tmp1, 1)));
+#else
+  union {Tv v; complex double c[2]; } u;
+  u.v=tmp1; *c1+=u.c[0]; *c2+=u.c[1];
+#endif
+  }
+
+#endif
+
+#endif
diff --git a/libsharp/libsharp.dox b/libsharp/libsharp.dox
new file mode 100644
index 0000000..2a5067c
--- /dev/null
+++ b/libsharp/libsharp.dox
@@ -0,0 +1,94 @@
+/*! \mainpage libsharp documentation
+  <ul>
+  <li>\ref introduction "Introduction"
+  <li><a href="modules.html">Programming interface</a>
+  </ul>
+ */
+
+/*! \page introduction Introduction to libsharp
+
+  "SHARP" is an acronym for <i>Performant Spherical Harmonic Transforms</i>.
+  All user-visible data types and functions in this library start with
+  the prefix "sharp_", or with "sharps_" and "sharpd_" for single- and
+  double precision variants, respectively.
+
+  <i>libsharp</i>'s main functionality is the conversion between <i>maps</i>
+  on the sphere and <i>spherical harmonic coefficients</i> (or <i>a_lm</i>).
+  A map is defined as a set of <i>rings</i>, which in turn consist of
+  individual pixels that
+  <ul>
+  <li>all have the same colatitude and</li>
+  <li>are uniformly spaced in azimuthal direction.</li>
+  </ul>
+  Consequently, a ring is completely defined by
+  <ul>
+  <li>its colatitude (in radians)</li>
+  <li>the number of pixels it contains</li>
+  <li>the azimuth (in radians) of the first pixel in the ring</li>
+  <li>the weight that must be multiplied to every pixel during a map
+      analysis (typically the solid angle of a pixel in the ring) </li>
+  <li>the offset of the first ring pixel in the <i>map array</i></li>
+  <li>the stride between consecutive pixels in the ring.</li>
+  </ul>
+  The map array is a one-dimensional array of type <i>float</i> or
+  <i>double</i>, which contains the values of all map pixels. It is assumed
+  that the pixels of every ring are stored inside this array in order of
+  increasing azimuth and with the specified stride. Note however that the rings
+  themselves can be stored in any order inside the array.
+
+  The a_lm array is a one-dimensional array of type <i>complex float</i> or
+  <i>complex double</i>, which contains all spherical harmonic coefficients
+  for a full or partial set of m quantum numbers with 0<=m<=mmax and m<=l<=lmax.
+  There is only one constraint on the internal structure of the array, which is:
+
+  <code>Index[a_l+1,m] = Index[a_l,m] + stride</code>
+
+  That means that coefficients with identical <i>m</i> but different <i>l</i>
+  can be interpreted as a one-dimensional array in <i>l</i> with a unique
+  stride.
+
+  Several functions are provided for efficient index computation in this array;
+  they are documented \ref almgroup "here".
+
+  Information about a pixelisation of the sphere is stored in objects of
+  type sharp_geom_info. It is possible to create such an object for any
+  supported pixelisation by using the function sharp_make_geometry_info();
+  however, several easier-to-use functions are \ref geominfogroup "supplied"
+  for generating often-used pixelisations like ECP grids, Gaussian grids,
+  and Healpix grids.
+
+  Currently, SHARP supports the following kinds of transforms:
+  <ul>
+  <li>scalar a_lm to map</li>
+  <li>scalar map to a_lm</li>
+<!--   <li>polarised a_lm to map</li>
+  <li>polarised map to a_lm</li> -->
+  <li>spin a_lm to map</li>
+  <li>spin map to a_lm</li>
+<!--  <li>scalar a_lm to maps of first derivatives</li> -->
+  </ul>
+
+  SHARP supports shared-memory parallelisation via OpenMP; this feature will
+  be automatically enabled if the compiler supports it.
+
+  SHARP will also make use of SSE2 and AVX instructions when compiled for a
+  platform known to support them.
+
+  Support for MPI-parallel transforms is also available; in this mode,
+  every MPI task must provide a unique subset of the map and a_lm coefficients.
+
+  The spherical harmonic transforms can be executed on double-precision and
+  single-precision maps and a_lm, but for accuracy reasons the computations
+  will always be performed in double precision. As a consequence,
+  single-precision transforms will most likely not be faster than their
+  double-precision counterparts, but they will require significantly less
+  memory.
+
+  Two example and benchmark programs are distributed with SHARP:
+  <ul>
+  <li>sharp_test.c checks the accuracy of the (iterative) map analysis
+      algorithm</li>
+  <li>sharp_bench.c determines the quickest transform strategy for a given
+      SHT</li>
+  </ul>
+*/
diff --git a/libsharp/oracle.inc b/libsharp/oracle.inc
new file mode 100644
index 0000000..7680861
--- /dev/null
+++ b/libsharp/oracle.inc
@@ -0,0 +1,9 @@
+static const int maxtr = 6; /* sharp_nv_oracle caps ntrans at this value before the lookup */
+static const int nv_opt[6][2][3] = { /* indexed as [min(ntrans,maxtr)-1][spin!=0][type] */
+{{4,2,-1},{2,1,-1}},
+{{4,2,-1},{2,1,-1}},
+{{5,2,-1},{5,2,-1}},
+{{5,2,-1},{5,2,-1}},
+{{5,2,-1},{5,2,-1}},
+{{5,2,-1},{5,2,-1}}
+};
diff --git a/libsharp/planck.make b/libsharp/planck.make
new file mode 100644
index 0000000..23dd2ad
--- /dev/null
+++ b/libsharp/planck.make
@@ -0,0 +1,29 @@
+PKG:=libsharp
+
+SD:=$(SRCROOT)/$(PKG)
+OD:=$(BLDROOT)/$(PKG)
+
+FULL_INCLUDE+= -I$(SD)
+
+HDR_$(PKG):=$(SD)/*.h
+LIB_$(PKG):=$(LIBDIR)/libsharp.a
+BIN:=sharp_test sharp_acctest sharp_test_mpi sharp_bench
+LIBOBJ:=ylmgen_c.o sharp.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o
+ALLOBJ:=$(LIBOBJ) sharp_test.o sharp_acctest.o sharp_test_mpi.o sharp_bench.o
+LIBOBJ:=$(LIBOBJ:%=$(OD)/%)
+ALLOBJ:=$(ALLOBJ:%=$(OD)/%)
+
+ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils)
+$(OD)/sharp_core.o: $(SD)/sharp_inchelper1.inc.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c $(SD)/sharp_core_inc3.c
+$(OD)/sharp.o: $(SD)/sharp_mpi.c $(SD)/oracle.inc
+BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils)
+
+$(LIB_$(PKG)): $(LIBOBJ)
+
+$(ALLOBJ): $(ODEP) | $(OD)_mkdir
+BIN:=$(BIN:%=$(BINDIR)/%)
+$(BIN): $(BINDIR)/% : $(OD)/%.o $(BDEP)
+
+all_hdr+=$(HDR_$(PKG))
+all_lib+=$(LIB_$(PKG))
+all_cbin+=$(BIN)
diff --git a/libsharp/sharp.c b/libsharp/sharp.c
new file mode 100644
index 0000000..6f44ffa
--- /dev/null
+++ b/libsharp/sharp.c
@@ -0,0 +1,596 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp.c
+ *  Spherical transform library
+ *
+ *  Copyright (C) 2006-2012 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#include <math.h>
+#include "ls_fft.h"
+#include "ylmgen_c.h"
+#include "sharp.h"
+#include "c_utils.h"
+#include "sharp_core.h"
+#include "vec_utils.h"
+#include "walltime_c.h"
+
+typedef complex double dcmplx;
+typedef complex float  fcmplx;
+
+static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize)
+  {
+  enum { min_size=500, max_chunks=10 }; /* chunk size floor, chunk count cap */
+  const int raw = IMAX(min_size,(ndata+max_chunks-1)/max_chunks);
+  *chunksize = ((raw+nmult-1)/nmult)*nmult; /* round up to a multiple of nmult */
+  *nchunks = (ndata+*chunksize-1) / *chunksize;
+  }
+
+typedef struct
+  {
+  double s; /* sort key; sharp_execute_job stores sin(theta) of a ring pair here */
+  int i;    /* position of that pair within the current chunk */
+  } idxhelper;
+
+static int idx_compare (const void *xa, const void *xb)
+  {
+  const idxhelper *lhs=xa, *rhs=xb; /* qsort comparator: descending in s */
+  return (lhs->s < rhs->s) - (lhs->s > rhs->s);
+  }
+
+typedef struct
+  {
+  double phi0_;             /* phi0 for which shiftarr was last computed */
+  dcmplx *shiftarr, *work;  /* shiftarr[m]=cos(m*phi0_)+i*sin(m*phi0_); work: FFT buffer, grown on demand */
+  int s_shift, s_work;      /* s_shift: entries in shiftarr; s_work: size bookkeeping passed to GROW for work */
+  real_plan plan;           /* FFT plan for the current ring length, NULL if none */
+  int norot;                /* nonzero if |phi0|<1e-14, in which case shiftarr is not applied */
+  } ringhelper;
+
+static void ringhelper_init (ringhelper *self)
+  {
+  /* put the helper into its empty state: no buffers, no plan, zero sizes */
+  *self = (ringhelper){ 0, NULL, NULL, 0, 0, NULL, 0 };
+  }
+
+static void ringhelper_destroy (ringhelper *self)
+  { /* releases the plan and both buffers, then resets self via ringhelper_init */
+  if (self->plan) kill_real_plan(self->plan);
+  DEALLOC(self->shiftarr);
+  DEALLOC(self->work);
+  ringhelper_init(self);
+  }
+
+static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
+  {
+  /* Prepares self for a ring with nph pixels and rotation angle phi0; the
+     shift factors and the FFT plan are rebuilt only when they are stale. */
+  self->norot = (fabs(phi0)<1e-14);
+  if ((!self->norot) &&
+      ((mmax!=self->s_shift-1) || (!FAPPROX(phi0,self->phi0_,1e-12))))
+    {
+    RESIZE (self->shiftarr,dcmplx,mmax+1);
+    self->s_shift = mmax+1;
+    self->phi0_ = phi0;
+    for (int m=0; m<=mmax; ++m)
+      self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0);
+    }
+  /* a plan is tied to one transform length, so replace a mismatching one */
+  if (self->plan && (nph!=(int)self->plan->length))
+    { kill_real_plan(self->plan); self->plan=NULL; }
+  if (!self->plan) self->plan=make_real_plan(nph);
+  GROW(self->work,dcmplx,self->s_work,nph);
+  }
+
+static int ringinfo_compare (const void *xa, const void *xb)
+  {
+  const sharp_ringinfo *lhs=xa, *rhs=xb; /* qsort comparator: ascending sth */
+  return (lhs->sth > rhs->sth) - (lhs->sth < rhs->sth);
+  }
+static int ringpair_compare (const void *xa, const void *xb)
+  {
+  const sharp_ringpair *lhs=xa, *rhs=xb;
+  /* qsort comparator: ascending r1.nph, ties broken by ascending r1.phi0 */
+  if (lhs->r1.nph!=rhs->r1.nph) return (lhs->r1.nph<rhs->r1.nph) ? -1 : 1;
+  return (lhs->r1.phi0 > rhs->r1.phi0) - (lhs->r1.phi0 < rhs->r1.phi0);
+  }
+
+void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
+  const ptrdiff_t *mstart, sharp_alm_info **alm_info)
+  {
+  /* Creates an a_lm descriptor with private copies of the nm entries of
+     mval and mstart and returns it through alm_info. */
+  sharp_alm_info *res = RALLOC(sharp_alm_info,1);
+
+  res->lmax = lmax;
+  res->nm = nm;
+  res->stride = stride;
+  res->mval = RALLOC(int,nm);
+  res->mvstart = RALLOC(ptrdiff_t,nm);
+  for (int k=0; k<nm; ++k)
+    { res->mval[k] = mval[k]; res->mvstart[k] = mstart[k]; }
+  *alm_info = res;
+  }
+
+void sharp_make_alm_info (int lmax, int mmax, int stride,
+  const ptrdiff_t *mstart, sharp_alm_info **alm_info)
+  {
+  /* convenience wrapper: uses the contiguous m values 0, 1, ..., mmax */
+  int *all_m=RALLOC(int,mmax+1);
+  for (int m=0; m<=mmax; ++m) all_m[m]=m;
+  sharp_make_general_alm_info (lmax, mmax+1, stride, all_m, mstart, alm_info);
+  DEALLOC(all_m);
+  }
+
+ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi)
+  { return self->mvstart[mi]+(ptrdiff_t)self->stride*l; } /* array index of a_(l,mval[mi]); widened first so stride*l cannot overflow int */
+
+void sharp_destroy_alm_info (sharp_alm_info *info)
+  { /* frees the mval and mvstart arrays and then the descriptor itself */
+  DEALLOC (info->mval);
+  DEALLOC (info->mvstart);
+  DEALLOC (info);
+  }
+
+void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
+  const int *stride, const double *phi0, const double *theta,
+  const double *weight, sharp_geom_info **geom_info)
+  {
+  /* Builds a geometry descriptor from per-ring arrays. Rings are sorted by
+     sin(theta); neighbours whose cos(theta) are opposite form one pair. */
+  sharp_geom_info *res = RALLOC(sharp_geom_info,1);
+  sharp_ringinfo *ring = RALLOC(sharp_ringinfo,nrings);
+
+  res->pair=RALLOC(sharp_ringpair,nrings);
+  res->npairs=0;
+  *geom_info = res;
+
+  for (int r=0; r<nrings; ++r)
+    {
+    sharp_ringinfo *cur = &ring[r];
+    cur->theta = theta[r];
+    cur->cth = cos(theta[r]);
+    cur->sth = sin(theta[r]);
+    cur->weight = weight[r];
+    cur->phi0 = phi0[r];
+    cur->ofs = ofs[r];
+    cur->stride = stride[r];
+    cur->nph = nph[r];
+    }
+  qsort(ring,nrings,sizeof(sharp_ringinfo),ringinfo_compare);
+
+  for (int pos=0; pos<nrings; ++pos)
+    {
+    sharp_ringpair *dst = &res->pair[res->npairs++];
+    dst->r1=ring[pos];
+    if ((pos<nrings-1) && FAPPROX(ring[pos].cth,-ring[pos+1].cth,1e-12))
+      dst->r2=ring[++pos];
+    else
+      dst->r2.nph=-1;  /* marks "no partner ring" */
+    }
+  DEALLOC(ring);
+
+  /* final order: by ring size, then phi0 (see ringpair_compare) */
+  qsort(res->pair,res->npairs,sizeof(sharp_ringpair),ringpair_compare);
+  }
+
+void sharp_destroy_geom_info (sharp_geom_info *geom_info)
+  { /* frees the pair array and then the descriptor itself */
+  DEALLOC (geom_info->pair);
+  DEALLOC (geom_info);
+  }
+
+static int sharp_get_mmax (int *mval, int nm)
+  { /* asserts that mval holds nm distinct values within 0..nm-1, then returns nm-1 */
+  int *mcheck=RALLOC(int,nm); /* mcheck[m] becomes 1 once m has been seen */
+  SET_ARRAY(mcheck,0,nm,0);
+  for (int i=0; i<nm; ++i)
+    {
+    int m_cur=mval[i];
+    UTIL_ASSERT((m_cur>=0) && (m_cur<nm), "m out of range");
+    UTIL_ASSERT(mcheck[m_cur]==0, "duplicate m value");
+    mcheck[m_cur]=1;
+    }
+  DEALLOC(mcheck);
+  return nm-1; // FIXME: this looks wrong (it is the maximum m only because the asserts above restrict mval to 0..nm-1)
+  }
+
+static void ringhelper_phase2ring (ringhelper *self,
+  const sharp_ringinfo *info, void *data, int mmax, const dcmplx *phase,
+  int pstride, sharp_fde fde)
+  {
+  /* Synthesises one ring: folds phase[m*pstride], m=0..mmax, into nph FFT
+     bins, transforms them backward and adds the real result to the map. */
+  int nph = info->nph, stride = info->stride;
+  ringhelper_update (self, nph, mmax, info->phi0);
+  self->work[0]=phase[0];
+  SET_ARRAY(self->work,1,nph,0.);
+#if 0
+  if (self->norot)
+    for (int m=1; m<=mmax; ++m)
+      {
+      int idx1 = m%nph;
+      int idx2 = nph-1-((m-1)%nph);
+      self->work[idx1]+=phase[m*pstride];
+      self->work[idx2]+=conj(phase[m*pstride]);
+      }
+  else
+    for (int m=1; m<=mmax; ++m)
+      {
+      int idx1 = m%nph;
+      int idx2 = nph-1-((m-1)%nph);
+      dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
+      self->work[idx1]+=tmp;
+      self->work[idx2]+=conj(tmp);
+      }
+#else
+  /* m=1 belongs to bin 1%nph, which is bin 0 for a single-pixel ring */
+  int idx1=(nph>1) ? 1 : 0, idx2=nph-1;
+  for (int m=1; m<=mmax; ++m)
+    {
+    dcmplx tmp = phase[m*pstride];
+    if(!self->norot) tmp*=self->shiftarr[m];
+    self->work[idx1]+=tmp;
+    self->work[idx2]+=conj(tmp);
+    if (++idx1>=nph) idx1=0;
+    if (--idx2<0) idx2=nph-1;
+    }
+#endif
+  real_plan_backward_c (self->plan, (double *)(self->work));
+  if (fde==DOUBLE)
+    for (int m=0; m<nph; ++m)
+      ((double *)data)[m*stride+info->ofs] += creal(self->work[m]);
+  else
+    for (int m=0; m<nph; ++m)
+      ((float *)data)[m*stride+info->ofs] += (float)creal(self->work[m]);
+  }
+
+static void ringhelper_ring2phase (ringhelper *self,
+  const sharp_ringinfo *info, const void *data, int mmax, dcmplx *phase,
+  int pstride, sharp_fde fde)
+  {
+  /* Analyses one ring: weights its pixels, transforms them forward and
+     writes the Fourier coefficients for m=0..mmax to phase[m*pstride]. */
+  const int nph = info->nph, step = info->stride;
+  const ptrdiff_t base = info->ofs;
+#if 1
+  int maxidx = mmax; /* Enable this for traditional Healpix compatibility */
+#else
+  int maxidx = IMIN(nph-1,mmax);
+#endif
+
+  ringhelper_update (self, nph, mmax, -info->phi0);
+  for (int k=0; k<nph; ++k)
+    self->work[k] = ((fde==DOUBLE) ? ((const double *)data)[base+k*step]
+                                   : ((const float *)data)[base+k*step])
+                    * info->weight;
+
+  real_plan_forward_c (self->plan, (double *)self->work);
+  for (int m=0; m<=maxidx; ++m)
+    {
+    dcmplx coeff = self->work[m%nph];
+    if (!self->norot) coeff *= self->shiftarr[m];
+    phase[m*pstride] = coeff;
+    }
+  /* only reached when maxidx<mmax, i.e. with the alternative setting above */
+  for (int m=maxidx+1;m<=mmax; ++m)
+    phase[m*pstride]=0.;
+  }
+
+static void ringhelper_pair2phase (ringhelper *self, int mmax,
+  const sharp_ringpair *pair, const void *data, dcmplx *phase1, dcmplx *phase2,
+  int pstride, sharp_fde fde)
+  { /* ring2phase for r1 (into phase1) and, if present, for r2 (into phase2) */
+  ringhelper_ring2phase (self, &(pair->r1), data, mmax, phase1, pstride, fde);
+  if (pair->r2.nph>0) /* sharp_make_geom_info sets r2.nph=-1 when there is no partner */
+    ringhelper_ring2phase (self, &(pair->r2), data, mmax, phase2, pstride, fde);
+  }
+
+static void ringhelper_phase2pair (ringhelper *self, int mmax,
+  const dcmplx *phase1, const dcmplx *phase2, int pstride,
+  const sharp_ringpair *pair, void *data, sharp_fde fde)
+  { /* phase2ring for r1 (from phase1) and, if present, for r2 (from phase2) */
+  ringhelper_phase2ring (self, &(pair->r1), data, mmax, phase1, pstride, fde);
+  if (pair->r2.nph>0) /* sharp_make_geom_info sets r2.nph=-1 when there is no partner */
+    ringhelper_phase2ring (self, &(pair->r2), data, mmax, phase2, pstride, fde);
+  }
+
+static void fill_map (const sharp_geom_info *ginfo, void *map, double value,
+  sharp_fde fde)
+  {
+  /* Stores value in every pixel of every ring. Unpaired rings carry a
+     negative r2.nph, so the pixel loop does nothing for their r2. */
+
+  for (int j=0;j<ginfo->npairs;++j)
+    {
+    for (int half=0; half<2; ++half)
+      {
+      const sharp_ringinfo *ring = half ? &ginfo->pair[j].r2
+                                        : &ginfo->pair[j].r1;
+      for (int i=0;i<ring->nph;++i)
+        {
+        ptrdiff_t pos = ring->ofs+i*ring->stride;
+        if (fde==DOUBLE)
+          ((double *)map)[pos]=value;
+        else
+          ((float *)map)[pos]=(float)value;
+        }
+      }
+    }
+  }
+
+static void fill_alm (const sharp_alm_info *ainfo, void *alm, dcmplx value,
+  sharp_fde fde)
+  {
+  /* Assigns value to all coefficients with mval[mi] <= l <= lmax. */
+  for (int mi=0;mi<ainfo->nm;++mi)
+    for (int l=ainfo->mval[mi];l<=ainfo->lmax;++l)
+      {
+      ptrdiff_t pos = sharp_alm_index(ainfo,l,mi);
+      if (fde==DOUBLE) ((dcmplx *)alm)[pos] = value;
+      else ((fcmplx *)alm)[pos] = (fcmplx)value;
+      }
+  }
+
+static void init_output (sharp_job *job)
+  {
+  /* Zeroes the output arrays (a_lm for MAP2ALM, maps otherwise) unless the
+     job asks for its results to be added to the existing contents. */
+  if (job->add_output) return;
+  const int nout = job->ntrans*((job->type==MAP2ALM) ? job->nalm : job->nmaps);
+  for (int i=0; i<nout; ++i)
+    if (job->type == MAP2ALM) fill_alm (job->ainfo,job->alm[i],0.,job->fde);
+    else fill_map (job->ginfo,job->map[i],0.,job->fde);
+  }
+
+static void alloc_phase (sharp_job *job, int nm, int ntheta)
+  { job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*nm*ntheta); } /* two rings per pair, for every map, m and ring pair */
+
+static void dealloc_phase (sharp_job *job)
+  { DEALLOC(job->phase); } /* counterpart of alloc_phase */
+
+//FIXME: set phase to zero if not MAP2ALM?
+static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
+  { /* MAP2ALM only: transforms the rings of pairs [llim,ulim) into job->phase */
+  if (job->type != MAP2ALM) return;
+  int pstride = 2*job->ntrans*job->nmaps; /* phase entries per (pair, m) */
+#pragma omp parallel
+{
+  ringhelper helper; /* declared inside the parallel region: one per thread */
+  ringhelper_init(&helper);
+#pragma omp for schedule(dynamic,1)
+  for (int ith=llim; ith<ulim; ++ith)
+    {
+    int dim2 = pstride*(ith-llim)*(mmax+1); /* start of this pair's block in phase */
+    for (int i=0; i<job->ntrans*job->nmaps; ++i)
+      ringhelper_pair2phase(&helper,mmax,&job->ginfo->pair[ith], job->map[i],
+        &job->phase[dim2+2*i], &job->phase[dim2+2*i+1], pstride, job->fde);
+    }
+  ringhelper_destroy(&helper);
+} /* end of parallel region */
+  }
+
+static void alloc_almtmp (sharp_job *job, int lmax)
+  { job->almtmp=RALLOC(dcmplx,job->ntrans*job->nalm*(lmax+1)); } /* ntrans*nalm entries for each l in 0..lmax */
+
+static void dealloc_almtmp (sharp_job *job)
+  { DEALLOC(job->almtmp); } /* counterpart of alloc_almtmp */
+
+static void alm2almtmp (sharp_job *job, int lmax, int mi)
+  {
+  /* Loads the scaled input a_lm of column mi into almtmp, or clears that
+     part of almtmp when the job is MAP2ALM and almtmp collects results. */
+  const int ncomp = job->ntrans*job->nalm, mstart = job->ainfo->mval[mi];
+  if (job->type==MAP2ALM)
+    {
+    SET_ARRAY(job->almtmp,ncomp*mstart,ncomp*(lmax+1),0.);
+    return;
+    }
+  for (int l=mstart; l<=lmax; ++l)
+    {
+    ptrdiff_t aidx = sharp_alm_index(job->ainfo,l,mi);
+    double fct = job->norm_l[l];
+    if (job->type!=ALM2MAP) fct = -fabs(fct)*sqrt(l*(l+1.));
+    for (int i=0; i<ncomp; ++i)
+      job->almtmp[ncomp*l+i] = (job->fde==DOUBLE) ?
+        ((dcmplx *)job->alm[i])[aidx]*fct : ((fcmplx *)job->alm[i])[aidx]*fct;
+    }
+  }
+
+static void almtmp2alm (sharp_job *job, int lmax, int mi)
+  {
+  if (job->type != MAP2ALM) return; /* only MAP2ALM jobs write a_lm */
+  const int ncomp = job->ntrans*job->nalm;
+  for (int l=job->ainfo->mval[mi]; l<=lmax; ++l)
+    {
+    ptrdiff_t aidx = sharp_alm_index(job->ainfo,l,mi);
+    for (int i=0;i<ncomp;++i)
+      {
+      dcmplx val = job->almtmp[ncomp*l+i]*job->norm_l[l];
+      if (job->fde==DOUBLE) ((dcmplx *)job->alm[i])[aidx] += val;
+      else ((fcmplx *)job->alm[i])[aidx] += (fcmplx)val;
+      }
+    }
+  }
+
+static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
+  { /* all job types except MAP2ALM: turns job->phase into map rings for pairs [llim,ulim) */
+  if (job->type == MAP2ALM) return;
+  int pstride = 2*job->ntrans*job->nmaps; /* phase entries per (pair, m) */
+#pragma omp parallel
+{
+  ringhelper helper; /* declared inside the parallel region: one per thread */
+  ringhelper_init(&helper);
+#pragma omp for schedule(dynamic,1)
+  for (int ith=llim; ith<ulim; ++ith)
+    {
+    int dim2 = pstride*(ith-llim)*(mmax+1); /* start of this pair's block in phase */
+    for (int i=0; i<job->ntrans*job->nmaps; ++i)
+      ringhelper_phase2pair(&helper,mmax,&job->phase[dim2+2*i],
+        &job->phase[dim2+2*i+1],pstride,&job->ginfo->pair[ith],job->map[i],
+        job->fde);
+    }
+  ringhelper_destroy(&helper);
+} /* end of parallel region */
+  }
+
+void sharp_execute_job (sharp_job *job)
+  { /* Runs the transform described by job, one chunk of ring pairs at a time. */
+  double timer=wallTime();
+  job->opcnt=0;
+  int lmax = job->ainfo->lmax,
+      mmax=sharp_get_mmax(job->ainfo->mval, job->ainfo->nm);
+
+  job->norm_l = Ylmgen_get_norm (lmax, job->spin);
+
+/* clear output arrays unless job->add_output is set */
+  init_output (job);
+
+  int nchunks, chunksize;
+  get_chunk_info(job->ginfo->npairs,job->nv*VLEN,&nchunks,&chunksize);
+  alloc_phase (job,mmax+1,chunksize);
+
+/* chunk loop: each pass handles the ring pairs [llim,ulim) */
+  for (int chunk=0; chunk<nchunks; ++chunk)
+    {
+    int llim=chunk*chunksize, ulim=IMIN(llim+chunksize,job->ginfo->npairs);
+    int *ispair = RALLOC(int,ulim-llim);
+    double *cth = RALLOC(double,ulim-llim), *sth = RALLOC(double,ulim-llim);
+    idxhelper *stmp = RALLOC(idxhelper,ulim-llim);
+    for (int i=0; i<ulim-llim; ++i)
+      {
+      ispair[i] = job->ginfo->pair[i+llim].r2.nph>0;
+      cth[i] = job->ginfo->pair[i+llim].r1.cth;
+      sth[i] = job->ginfo->pair[i+llim].r1.sth;
+      stmp[i].s=sth[i];
+      stmp[i].i=i;
+      }
+    qsort (stmp,ulim-llim,sizeof(idxhelper),idx_compare); /* decreasing sin(theta) */
+    int *idx = RALLOC(int,ulim-llim); /* chunk-local pair indices in that order */
+    for (int i=0; i<ulim-llim; ++i)
+      idx[i]=stmp[i].i;
+    DEALLOC(stmp);
+
+/* map->phase where necessary */
+    map2phase (job, mmax, llim, ulim);
+
+#pragma omp parallel
+{
+    sharp_job ljob = *job; /* per-thread copy with its own almtmp and opcnt */
+    ljob.opcnt=0;
+    Ylmgen_C generator;
+    Ylmgen_init (&generator,lmax,mmax,ljob.spin);
+    alloc_almtmp(&ljob,lmax);
+
+#pragma omp for schedule(dynamic,1)
+    for (int mi=0; mi<job->ainfo->nm; ++mi)
+      {
+/* alm->alm_tmp where necessary */
+      alm2almtmp (&ljob, lmax, mi);
+
+      inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, idx);
+
+/* alm_tmp->alm where necessary */
+      almtmp2alm (&ljob, lmax, mi);
+      }
+
+    Ylmgen_destroy(&generator);
+    dealloc_almtmp(&ljob);
+
+#pragma omp critical
+    job->opcnt+=ljob.opcnt;
+} /* end of parallel region */
+
+/* phase->map where necessary */
+    phase2map (job, mmax, llim, ulim);
+
+    DEALLOC(ispair);
+    DEALLOC(cth);
+    DEALLOC(sth);
+    DEALLOC(idx);
+    } /* end of chunk loop */
+
+  DEALLOC(job->norm_l);
+  dealloc_phase (job);
+  job->time=wallTime()-timer; /* elapsed wallTime() for the whole job */
+  }
+
+static void sharp_build_job_common (sharp_job *job, sharp_jobtype type, int spin,
+  int add_output, const sharp_geom_info *geom_info,
+  const sharp_alm_info *alm_info, int ntrans)
+  { /* validates the arguments and fills the precision-independent fields of job */
+  UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms");
+  UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin");
+  UTIL_ASSERT((type==MAP2ALM)||(type==ALM2MAP), "unsupported SHT type");
+  job->type = type;
+  job->spin = spin;
+  job->norm_l = NULL; /* allocated and released inside sharp_execute_job */
+  job->add_output = add_output;
+  job->nmaps = (type==ALM2MAP_DERIV1) ? 2 : ((spin>0) ? 2 : 1); /* the DERIV1 case is rejected by the assert above */
+  job->nalm = (type==ALM2MAP_DERIV1) ? 1 : ((spin>0) ? 2 : 1);
+  job->ginfo = geom_info;
+  job->ainfo = alm_info;
+  job->nv = sharp_nv_oracle (type, spin, ntrans);
+  job->time = 0.;
+  job->opcnt = 0;
+  job->ntrans = ntrans;
+  }
+
+void sharpd_build_job (sharp_job *job, sharp_jobtype type, int spin,
+  int add_output, dcmplx **alm, double **map, const sharp_geom_info *geom_info,
+  const sharp_alm_info *alm_info, int ntrans)
+  { /* double-precision variant: stores the array pointers and tags the job DOUBLE */
+  sharp_build_job_common (job, type, spin, add_output, geom_info, alm_info,
+    ntrans);
+  job->alm=(void **)alm;
+  job->map=(void **)map;
+  job->fde=DOUBLE;
+  }
+
+void sharps_build_job (sharp_job *job, sharp_jobtype type, int spin,
+  int add_output, fcmplx **alm, float **map, const sharp_geom_info *geom_info,
+  const sharp_alm_info *alm_info, int ntrans)
+  { /* single-precision variant: stores the array pointers and tags the job FLOAT */
+  sharp_build_job_common (job, type, spin, add_output, geom_info, alm_info,
+    ntrans);
+  job->alm=(void **)alm;
+  job->map=(void **)map;
+  job->fde=FLOAT;
+  }
+
+int sharp_get_nv_max (void)
+{ return 6; } /* NOTE(review): presumably an upper bound for the nv values from sharp_nv_oracle - confirm */
+
+int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans)
+  {
+  /* Returns the nv value tabulated in oracle.inc for this kind of job. */
+  UTIL_ASSERT(type!=ALM2MAP_DERIV1,"transform type not yet supported");
+  UTIL_ASSERT(ntrans>0,"bad number of simultaneous transforms");
+#include "oracle.inc"
+  return nv_opt[IMIN(ntrans,maxtr)-1][spin!=0][type];
+  }
+
+#include "sharp_mpi.c"
diff --git a/libsharp/sharp.h b/libsharp/sharp.h
new file mode 100644
index 0000000..590da0b
--- /dev/null
+++ b/libsharp/sharp.h
@@ -0,0 +1,213 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp.h
+ *  Interface for the spherical transform library.
+ *
+ *  Copyright (C) 2006-2012 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#ifndef PLANCK_SHARP_H
+#define PLANCK_SHARP_H
+
+#include <stddef.h>
+#include <complex.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! \internal
+    Helper type containing information about a single ring. */
+typedef struct
+  {
+  /*! \a theta: colatitude of the ring; \a phi0: azimuth of its first pixel;
+      \a weight: pixel weight used for the ring (same meaning as the
+      equally named parameters of sharp_make_geom_info()).
+      \a cth, \a sth: presumably cos(theta) and sin(theta) -- TODO confirm. */
+  double theta, phi0, weight, cth, sth;
+  /*! Index of the first pixel of the ring in the map array */
+  ptrdiff_t ofs;
+  /*! \a nph: number of pixels in the ring; \a stride: stride between
+      consecutive pixels */
+  int nph, stride;
+  } sharp_ringinfo;
+
+/*! \internal
+    Helper type containing information about a pair of rings with colatitudes
+    symmetric around the equator. */
+typedef struct
+  {
+  sharp_ringinfo r1,r2;
+  } sharp_ringpair;
+
+/*! \internal
+    Type holding all required information about a map geometry. */
+typedef struct
+  {
+  sharp_ringpair *pair;
+  int npairs;
+  } sharp_geom_info;
+
+/*! \defgroup almgroup Helpers for dealing with a_lm */
+/*! \{ */
+
+/*! \internal
+    Helper type for index calculation in a_lm arrays. */
+typedef struct
+  {
+  /*! Maximum \a l index of the array */
+  int lmax;
+  /*! Number of different \a m values in this object */
+  int nm;
+  /*! Array with \a nm entries containing the individual m values */
+  int *mval;
+  /*! Array with \a nm entries containing the (hypothetical) indices of
+      the coefficients with quantum numbers 0,\a mval[i] */
+  ptrdiff_t *mvstart;
+  /*! Stride between a_lm and a_(l+1),m */
+  ptrdiff_t stride;
+  } sharp_alm_info;
+
+/*! Creates an a_lm information data structure from the following parameters:
+    \param lmax maximum \a l quantum number (>=0)
+    \param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax)
+    \param stride the stride between consecutive a_lm entries
+    \param mstart the index of the (hypothetical) coefficient with the
+      quantum numbers 0,\a m. Must have \a mmax+1 entries.
+    \param alm_info will hold a pointer to the newly created data structure
+ */
+void sharp_make_alm_info (int lmax, int mmax, int stride,
+  const ptrdiff_t *mstart, sharp_alm_info **alm_info);
+/*! Creates an a_lm information data structure from the following parameters:
+    \param lmax maximum \a l quantum number (>=0)
+    \param nm number of different \a m (<=\a lmax+1)
+    \param stride the stride between consecutive a_lm entries
+    \param mval array with \a nm entries containing the individual m values
+    \param mvstart array with \a nm entries containing the (hypothetical)
+      indices of the coefficients with the quantum numbers 0,\a mval[i]
+    \param alm_info will hold a pointer to the newly created data structure
+ */
+void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
+  const ptrdiff_t *mvstart, sharp_alm_info **alm_info);
+/*! Returns the index of the coefficient with quantum numbers \a l,
+    \a mval[mi]. */
+ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi);
+/*! Deallocates the a_lm info object. */
+void sharp_destroy_alm_info (sharp_alm_info *info);
+
+/*! \} */
+
+/*! \defgroup geominfogroup Functions for dealing with geometry information */
+/*! \{ */
+
+/*! Creates a geometry information object from a set of ring descriptions.
+    All arrays passed to this function must have \a nrings elements.
+    \param nrings the number of rings in the map
+    \param nph the number of pixels in each ring
+    \param ofs the index of the first pixel in each ring in the map array
+    \param stride the stride between consecutive pixels
+    \param phi0 the azimuth (in radians) of the first pixel in each ring
+    \param theta the colatitude (in radians) of each ring
+    \param weight the pixel weight to be used for the ring
+    \param geom_info will hold a pointer to the newly created data structure
+ */
+void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
+  const int *stride, const double *phi0, const double *theta,
+  const double *weight, sharp_geom_info **geom_info);
+
+/*! Deallocates the geometry information in \a info. */
+void sharp_destroy_geom_info (sharp_geom_info *info);
+
+/*! \} */
+
+/*! \defgroup jobgroup Functionality for defining and executing SHTs */
+/*! \{ */
+
+/*! Enumeration of SHARP job types. */
+typedef enum { MAP2ALM,       /*!< analysis */
+               ALM2MAP,       /*!< synthesis */
+               ALM2MAP_DERIV1 /*!< currently unused */
+             } sharp_jobtype;
+
+/*! Tag for the floating-point precision of the user-supplied map and a_lm
+    arrays of a job: sharps_build_job() stores FLOAT, sharpd_build_job()
+    stores DOUBLE. */
+typedef enum { FLOAT, DOUBLE } sharp_fde;
+
+/*! \internal
+    Type holding all required information about an SHT job. */
+typedef struct
+  {
+  /*! Kind of transform, as passed to the build functions */
+  sharp_jobtype type;
+  /*! Spin of the transformed quantities */
+  int spin;
+  /*! If 0, the output arrays are overwritten, else the result is added to
+      them */
+  int add_output;
+  /*! Set by the build functions; for the supported job types both are 2 if
+      \a spin>0 and 1 if \a spin==0 */
+  int nmaps, nalm;
+  /*! Precision of the data that \a map and \a alm point to */
+  sharp_fde fde;
+  /*! User-supplied map pointers; the data is not copied */
+  void **map;
+  /*! User-supplied a_lm pointers; the data is not copied */
+  void **alm;
+  /*! Presumably a work buffer managed during job execution -- TODO confirm */
+  complex double *phase;
+  /*! Reset to NULL whenever the job is built */
+  double *norm_l;
+  /*! Presumably a work buffer managed during job execution -- TODO confirm */
+  complex double *almtmp;
+  /*! Map geometry supplied when the job was built (not owned by the job) */
+  const sharp_geom_info *ginfo;
+  /*! a_lm layout supplied when the job was built (not owned by the job) */
+  const sharp_alm_info *ainfo;
+  /*! Initialised from sharp_nv_oracle(); the test and benchmark programs
+      overwrite it before executing the job */
+  int nv;
+  /*! Timing information; reset to 0 when the job is built and read by
+      sharp_bench after execution */
+  double time;
+  /*! Number of simultaneous transforms */
+  int ntrans;
+  /*! Operation counter; reset to 0 when the job is built and read by
+      sharp_bench after execution */
+  unsigned long long opcnt;
+  } sharp_job;
+
+/*! Initializes \a job with the appropriate parameters to perform the required
+  SHT.
+  \param type the type of SHT (currently ALM2MAP and MAP2ALM)
+  \param spin the spin of the quantities to be transformed
+  \param add_output if 0, the output arrays will be overwritten,
+    else the result will be added to the output arrays.
+  \param ntrans the number of simultaneous SHTs
+  \param alm contains pointers to the a_lm coefficients. If \a spin==0,
+    alm[0] points to the a_lm of the first SHT, alm[1] to those of the second
+    etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT,
+    alm[2] and alm[3] to those of the second, etc.
+  \param map contains pointers to the maps. If \a spin==0,
+    map[0] points to the map of the first SHT, map[1] to that of the second
+    etc. If \a spin>0, map[0] and map[1] point to the maps of the first SHT,
+    map[2] and map[3] to those of the second, etc.
+  \note \a map and \a a_lm must not be de-allocated until after the last call of
+    sharp_execute_job()! This is because the library does not copy the input
+    data, but only stores the pointers to the supplied maps and a_lm. */
+void sharpd_build_job (sharp_job *job, sharp_jobtype type, int spin,
+  int add_output, complex double **alm, double **map,
+  const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans);
+
+/*! Single-precision counterpart of sharpd_build_job(): takes the same
+    parameters, but \a alm and \a map point to complex float and float data,
+    respectively. */
+void sharps_build_job (sharp_job *job, sharp_jobtype type, int spin,
+  int add_output, complex float **alm, float **map,
+  const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans);
+
+/*! Execute the SHT job previously constructed by sharpd_build_job() or
+    sharps_build_job(). */
+void sharp_execute_job (sharp_job *job);
+
+/*! \} */
+
+/*! Internal. Returns the largest \a nv value (currently 6); sharp_bench
+    benchmarks every \a nv from 1 up to this value. */
+int sharp_get_nv_max (void);
+/*! Internal. Returns the \a nv value stored in the table of oracle.inc for
+    the given job type, spin (only zero vs. nonzero is distinguished) and
+    number of simultaneous transforms. \a type must not be ALM2MAP_DERIV1. */
+int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/libsharp/sharp_acctest.c b/libsharp/sharp_acctest.c
new file mode 100644
index 0000000..3f36877
--- /dev/null
+++ b/libsharp/sharp_acctest.c
@@ -0,0 +1,217 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_acctest.c
+    Systematic accuracy test for libsharp.
+
+    Copyright (C) 2006-2012 Max-Planck-Society
+    \author Martin Reinecke
+*/
+
+#include <stdio.h>
+#include <string.h>
+#ifdef USE_MPI
+#include "mpi.h"
+#endif
+#include "sharp.h"
+#include "sharp_geomhelpers.h"
+#include "sharp_almhelpers.h"
+#include "c_utils.h"
+#include "sharp_core.h"
+
+typedef complex double dcmplx;
+
+/* Maps the next value of rand() linearly onto the interval [min;max). */
+static double drand (double min, double max)
+  {
+  double span = max-min;
+  return min + span*rand()/(RAND_MAX+1.0);
+  }
+
+/* Fills the coefficients described by helper with random values whose real
+   and imaginary parts come from drand(-1,1). Coefficients with m==0 get a
+   zero imaginary part; coefficients with both l and m below spin are set to
+   zero. */
+static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin)
+  {
+  for (int mi=0; mi<helper->nm; ++mi)
+    {
+    const int m = helper->mval[mi];
+    for (int l=m; l<=helper->lmax; ++l)
+      {
+      dcmplx *coeff = &alm[sharp_alm_index(helper,l,mi)];
+      if ((l<spin)&&(m<spin))
+        {
+        *coeff = 0.;
+        continue;
+        }
+      /* draw the real part first so the random sequence stays unchanged */
+      const double re = drand(-1,1);
+      const double im = (m==0) ? 0 : drand(-1,1);
+      *coeff = re+_Complex_I*im;
+      }
+    }
+  }
+
+/* Compares the first nalms coefficients of every component of alm2 with alm.
+   Fails unless both the largest absolute deviation of a real or imaginary
+   part and the RMS deviation relative to the RMS of alm are below 1e-10. */
+static void measure_errors (dcmplx **alm, dcmplx **alm2,
+  ptrdiff_t nalms, int ncomp)
+  {
+  for (int comp=0; comp<ncomp; ++comp)
+    {
+    const dcmplx *ref=alm[comp], *res=alm2[comp];
+    double sqdiff=0, sqref=0, maxdiff=0;
+    for (ptrdiff_t k=0; k<nalms; ++k)
+      {
+      const double re=creal(ref[k]), im=cimag(ref[k]);
+      const double dre=re-creal(res[k]), dimag=im-cimag(res[k]);
+      sqdiff+=dre*dre+dimag*dimag;
+      sqref+=re*re+im*im;
+      if (fabs(dre)>maxdiff) maxdiff=fabs(dre);
+      if (fabs(dimag)>maxdiff) maxdiff=fabs(dimag);
+      }
+    const double rms_diff=sqrt(sqdiff/nalms);
+    const double rms_ref=sqrt(sqref/nalms);
+    UTIL_ASSERT((maxdiff<1e-10)&&(rms_diff/rms_ref<1e-10),"error");
+    }
+  }
+
+/* Regression test for sign and normalisation conventions: synthesises maps
+   from a_lm that are all equal to 1+i (lmax=mmax=50) for spins 0, 1 and 2 and
+   compares the first, the middle and the last pixel of each resulting map
+   with hard-coded reference values. Any mismatch aborts via UTIL_ASSERT. */
+static void check_sign_scale(void)
+  {
+  int lmax=50;
+  int mmax=lmax;
+  sharp_geom_info *tinfo;
+  int nrings=lmax+1;
+  int ppring=2*lmax+2;
+  ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
+  sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
+
+  sharp_alm_info *alms;
+  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
+  /* number of coefficients with 0<=m<=mmax and m<=l<=lmax */
+  ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
+
+  /* two maps and two a_lm sets, as needed by the spin>0 transforms */
+  double **map;
+  ALLOC2D(map,double,2,npix);
+
+  dcmplx **alm;
+  ALLOC2D(alm,dcmplx,2,nalms);
+  for (int i=0; i<2; ++i)
+    for (int j=0; j<nalms; ++j)
+      alm[i][j]=1.+_Complex_I;
+
+  /* spin 0: only map[0] is produced */
+  sharp_job job;
+  sharpd_build_job(&job,ALM2MAP,0,0,&alm[0],&map[0],tinfo,alms,1);
+  sharp_execute_job(&job);
+  UTIL_ASSERT(FAPPROX(map[0][0     ], 3.588246976618616912e+00,1e-12),"error");
+  UTIL_ASSERT(FAPPROX(map[0][npix/2], 4.042209792157496651e+01,1e-12),"error");
+  UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.234675107554816442e+01,1e-12),"error");
+
+  /* spin 1: both maps are produced */
+  sharpd_build_job(&job,ALM2MAP,1,0,&alm[0],&map[0],tinfo,alms,1);
+  sharp_execute_job(&job);
+  UTIL_ASSERT(FAPPROX(map[0][0     ], 2.750897760535633285e+00,1e-12),"error");
+  UTIL_ASSERT(FAPPROX(map[0][npix/2], 3.137704477368562905e+01,1e-12),"error");
+  UTIL_ASSERT(FAPPROX(map[0][npix-1],-8.405730859837063917e+01,1e-12),"error");
+  UTIL_ASSERT(FAPPROX(map[1][0     ],-2.398026536095463346e+00,1e-12),"error");
+  UTIL_ASSERT(FAPPROX(map[1][npix/2],-4.961140548331700728e+01,1e-12),"error");
+  UTIL_ASSERT(FAPPROX(map[1][npix-1],-1.412765834230440021e+01,1e-12),"error");
+
+  /* spin 2: both maps are produced */
+  sharpd_build_job(&job,ALM2MAP,2,0,&alm[0],&map[0],tinfo,alms,1);
+  sharp_execute_job(&job);
+  UTIL_ASSERT(FAPPROX(map[0][0     ],-1.398186224727334448e+00,1e-12),"error");
+  UTIL_ASSERT(FAPPROX(map[0][npix/2],-2.456676000884031197e+01,1e-12),"error");
+  UTIL_ASSERT(FAPPROX(map[0][npix-1],-1.516249174408820863e+02,1e-12),"error");
+  UTIL_ASSERT(FAPPROX(map[1][0     ],-3.173406200299964119e+00,1e-12),"error");
+  UTIL_ASSERT(FAPPROX(map[1][npix/2],-5.831327404513146462e+01,1e-12),"error");
+  UTIL_ASSERT(FAPPROX(map[1][npix-1],-1.863257892248353897e+01,1e-12),"error");
+
+  DEALLOC2D(map);
+  DEALLOC2D(alm);
+
+  sharp_destroy_alm_info(alms);
+  sharp_destroy_geom_info(tinfo);
+  }
+
+/* Round-trip test: generates random a_lm (fixed seed), synthesises maps from
+   them, analyses the maps again and lets measure_errors() compare the
+   recovered coefficients with the original ones. The nv value of both jobs
+   is forced to the supplied one. */
+static void check_accuracy (sharp_geom_info *tinfo, ptrdiff_t lmax,
+  ptrdiff_t mmax, ptrdiff_t npix, int spin, int ntrans, int nv)
+  {
+  const int ncomp = ntrans*((spin==0) ? 1 : 2);
+  const ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
+
+  sharp_alm_info *alms;
+  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
+
+  double **map;
+  dcmplx **alm, **alm2;
+  ALLOC2D(map,double,ncomp,npix);
+  ALLOC2D(alm,dcmplx,ncomp,nalms);
+  ALLOC2D(alm2,dcmplx,ncomp,nalms);
+
+  /* fixed seed: every call works on the same pseudo-random input */
+  srand(4);
+  for (int comp=0; comp<ncomp; ++comp)
+    random_alm(alm[comp],alms,spin);
+
+  /* synthesis: alm -> map */
+  sharp_job synthesis;
+  sharpd_build_job(&synthesis,ALM2MAP,spin,0,&alm[0],&map[0],tinfo,alms,
+    ntrans);
+  synthesis.nv=nv;
+  sharp_execute_job(&synthesis);
+
+  /* analysis: map -> alm2 */
+  sharp_job analysis;
+  sharpd_build_job(&analysis,MAP2ALM,spin,0,&alm2[0],&map[0],tinfo,alms,
+    ntrans);
+  analysis.nv=nv;
+  sharp_execute_job(&analysis);
+
+  measure_errors(alm,alm2,nalms,ncomp);
+
+  DEALLOC2D(map);
+  DEALLOC2D(alm);
+  DEALLOC2D(alm2);
+
+  sharp_destroy_alm_info(alms);
+  }
+
+/* Test driver: runs the sign/scale regression test, then the round-trip
+   accuracy test on a Gauss geometry with lmax=127 for every combination of
+   nv, number of simultaneous transforms and a set of spins. Any failure
+   aborts inside the checks; reaching the end returns 0. */
+int main(void)
+  {
+#ifdef USE_MPI
+  MPI_Init(NULL,NULL);
+#endif
+  module_startup_c("sharp_acctest",1,1,"",1);
+
+  const int lmax=127;
+  /* spins exercised for every (nv, ntrans) combination */
+  static const int spins[] = { 0, 1, 2, 3, 30 };
+  const int nspins = (int)(sizeof(spins)/sizeof(spins[0]));
+  /* largest number of simultaneous transforms that is tested */
+  const int max_ntrans=6;
+  /* ask the library for the nv range instead of hard-coding it */
+  const int nv_max=sharp_get_nv_max();
+
+  printf("Checking signs and scales.\n");
+  check_sign_scale();
+  printf("Passed.\n\n");
+
+  printf("Testing map analysis accuracy.\n");
+
+  sharp_geom_info *tinfo;
+  int nrings=lmax+1;
+  int ppring=2*lmax+2;
+  ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
+  sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
+  for (int nv=1; nv<=nv_max; ++nv)
+    for (int ntrans=1; ntrans<=max_ntrans; ++ntrans)
+      for (int s=0; s<nspins; ++s)
+        check_accuracy(tinfo,lmax,lmax,npix,spins[s],ntrans,nv);
+  sharp_destroy_geom_info(tinfo);
+  printf("Passed.\n\n");
+
+#ifdef USE_MPI
+  MPI_Finalize();
+#endif
+  return 0;
+  }
diff --git a/libsharp/sharp_almhelpers.c b/libsharp/sharp_almhelpers.c
new file mode 100644
index 0000000..039e6f9
--- /dev/null
+++ b/libsharp/sharp_almhelpers.c
@@ -0,0 +1,68 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_almhelpers.c
+ *  Spherical transform library
+ *
+ *  Copyright (C) 2008-2011 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#include "sharp_almhelpers.h"
+#include "c_utils.h"
+
+/* Allocates an a_lm info object covering all m in [0;mmax] and fills in
+   everything except the mvstart entries, which depend on the storage
+   scheme. Aborts if mmax lies outside [0;lmax]. */
+static sharp_alm_info *alloc_dense_alm_info (int lmax, int mmax, int stride)
+  {
+  /* a negative mmax would lead to bogus allocation sizes below */
+  UTIL_ASSERT((mmax>=0)&&(mmax<=lmax),"bad lmax/mmax combination");
+  sharp_alm_info *info = RALLOC(sharp_alm_info,1);
+  info->lmax = lmax;
+  info->nm = mmax+1;
+  info->mval = RALLOC(int,mmax+1);
+  info->mvstart = RALLOC(ptrdiff_t,mmax+1);
+  info->stride = stride;
+  for (int m=0; m<=mmax; ++m)
+    info->mval[m] = m;
+  return info;
+  }
+
+/* Creates an a_lm info object for the triangular scheme: the (hypothetical)
+   l=0 coefficient of column m is located at stride*m*(2*lmax+1-m)/2.
+   lmax      maximum l quantum number
+   mmax      maximum m quantum number (0<=mmax<=lmax)
+   stride    stride between consecutive a_lm entries
+   alm_info  receives a pointer to the newly allocated object */
+void sharp_make_triangular_alm_info (int lmax, int mmax, int stride,
+  sharp_alm_info **alm_info)
+  {
+  sharp_alm_info *info = alloc_dense_alm_info (lmax, mmax, stride);
+  const int tval = 2*lmax+1;
+  /* m is ptrdiff_t so that the index arithmetic is not carried out in int */
+  for (ptrdiff_t m=0; m<=mmax; ++m)
+    info->mvstart[m] = stride*((m*(tval-m))>>1);
+  *alm_info = info;
+  }
+
+/* Creates an a_lm info object for the rectangular scheme: every m column
+   occupies lmax+1 entries, i.e. column m starts at stride*m*(lmax+1).
+   lmax      maximum l quantum number
+   mmax      maximum m quantum number (0<=mmax<=lmax)
+   stride    stride between consecutive a_lm entries
+   alm_info  receives a pointer to the newly allocated object */
+void sharp_make_rectangular_alm_info (int lmax, int mmax, int stride,
+  sharp_alm_info **alm_info)
+  {
+  sharp_alm_info *info = alloc_dense_alm_info (lmax, mmax, stride);
+  /* m is ptrdiff_t so that the index arithmetic is not carried out in int */
+  for (ptrdiff_t m=0; m<=mmax; ++m)
+    info->mvstart[m] = stride*m*(lmax+1);
+  *alm_info = info;
+  }
diff --git a/libsharp/sharp_almhelpers.h b/libsharp/sharp_almhelpers.h
new file mode 100644
index 0000000..c6cb35a
--- /dev/null
+++ b/libsharp/sharp_almhelpers.h
@@ -0,0 +1,57 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_almhelpers.h
+ *  SHARP helper function for the creation of a_lm data structures
+ *
+ *  Copyright (C) 2008-2011 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#ifndef PLANCK_SHARP_ALMHELPERS_H
+#define PLANCK_SHARP_ALMHELPERS_H
+
+#include "sharp.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! Initialises an a_lm data structure according to the scheme used by
+    Healpix_cxx.
+    \param lmax maximum \a l quantum number
+    \param mmax maximum \a m quantum number; all \a m from 0 to \a mmax are
+      included
+    \param stride the stride between consecutive a_lm entries
+    \param alm_info will hold a pointer to the newly created data structure
+      (the test programs release it with sharp_destroy_alm_info())
+    \ingroup almgroup */
+void sharp_make_triangular_alm_info (int lmax, int mmax, int stride,
+  sharp_alm_info **alm_info);
+
+/*! Initialises an a_lm data structure according to the scheme used by
+    Fortran Healpix
+    \param lmax maximum \a l quantum number
+    \param mmax maximum \a m quantum number; all \a m from 0 to \a mmax are
+      included
+    \param stride the stride between consecutive a_lm entries
+    \param alm_info will hold a pointer to the newly created data structure
+    \ingroup almgroup */
+void sharp_make_rectangular_alm_info (int lmax, int mmax, int stride,
+  sharp_alm_info **alm_info);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/libsharp/sharp_bench.c b/libsharp/sharp_bench.c
new file mode 100644
index 0000000..185dc9e
--- /dev/null
+++ b/libsharp/sharp_bench.c
@@ -0,0 +1,143 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_bench.c
+    Copyright (C) 2012 Max-Planck-Society
+    \author Martin Reinecke
+*/
+
+#include <stdio.h>
+#include <string.h>
+#ifdef USE_MPI
+#include "mpi.h"
+#endif
+#include "sharp.h"
+#include "sharp_geomhelpers.h"
+#include "sharp_almhelpers.h"
+#include "c_utils.h"
+#include "sharp_core.h"
+
+typedef complex double dcmplx;
+
+/* Benchmarks one transform configuration on zero-filled input (Gauss
+   geometry with 512 rings of 1024 pixels, lmax=2047, mmax=128).
+   spin, type, ntrans  describe the job to be timed
+   nv                  the nv value forced onto the job
+   time                receives the smallest job.time over all repetitions
+   opcnt               receives the smallest job.opcnt over all repetitions */
+static void bench_sht (int spin, int nv, sharp_jobtype type,
+  int ntrans, double *time, unsigned long long *opcnt)
+  {
+  enum { nruns=4 }; /* number of repetitions of which the best one counts */
+  int lmax=2047;
+  int mmax=128;
+  int nrings=512;
+  int ppring=1024;
+  ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
+  sharp_geom_info *tinfo;
+  sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
+
+  ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
+  int ncomp = ntrans*((spin==0) ? 1 : 2);
+
+  double **map;
+  ALLOC2D(map,double,ncomp,npix);
+  SET_ARRAY(map[0],0,npix*ncomp,0.);
+
+  sharp_alm_info *alms;
+  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
+
+  dcmplx **alm;
+  ALLOC2D(alm,dcmplx,ncomp,nalms);
+  SET_ARRAY(alm[0],0,nalms*ncomp,0.);
+
+  /* start from huge sentinels so that the first run always improves them */
+  *time=1e30;
+  *opcnt=1000000000000000;
+  for (int run=0; run<nruns; ++run)
+    {
+    /* the job is rebuilt for every run, which also resets its statistics */
+    sharp_job job;
+    sharpd_build_job(&job,type,spin,0,&alm[0],&map[0],tinfo,alms,ntrans);
+    job.nv=nv;
+    sharp_execute_job(&job);
+
+    if (job.opcnt<*opcnt) *opcnt=job.opcnt;
+    if (job.time<*time) *time=job.time;
+    }
+
+  DEALLOC2D(map);
+  DEALLOC2D(alm);
+
+  sharp_destroy_alm_info(alms);
+  sharp_destroy_geom_info(tinfo);
+  }
+
+/* Benchmark driver: for 1 to 6 simultaneous transforms, spins 0 and 2 and
+   both supported job types, times every nv from 1 to sharp_get_nv_max(),
+   prints the fastest one together with the deviation of the current oracle
+   value, and writes the winners as a table to the file "oracle.inc" in the
+   current directory. */
+int main(void)
+  {
+#ifdef USE_MPI
+  MPI_Init(NULL,NULL);
+#endif
+  module_startup_c("sharp_bench",1,1,"",1);
+
+  printf("Benchmarking SHTs.\n\n");
+  FILE *fp=fopen("oracle.inc","w");
+  UTIL_ASSERT(fp, "failed to open oracle file for writing");
+  /* constant text goes through fputs so that no non-literal string is ever
+     interpreted as a format */
+  fputs("static const int maxtr = 6;\n",fp);
+  fputs("static const int nv_opt[6][2][3] = {\n",fp);
+
+  const int nvmax=sharp_get_nv_max();
+  for (int ntr=1; ntr<=6; ++ntr)
+    {
+    fputs("{",fp);
+    for (int spin=0; spin<=2; spin+=2)
+      {
+      fputs("{",fp);
+      for (sharp_jobtype type=MAP2ALM; type<=ALM2MAP; ++type)
+        {
+        int nvbest=-1, nvoracle=sharp_nv_oracle(type,spin,ntr);
+        unsigned long long opmin=1000000000000000, op;
+        double tmin=1e30;
+        /* slot 0 stays unused so that the array can be indexed by nv */
+        double *time=RALLOC(double,nvmax+1);
+        for (int nv=1; nv<=nvmax; ++nv)
+          {
+          bench_sht (spin,nv,type,ntr,&time[nv],&op);
+          if (op<opmin) opmin=op;
+          if (time[nv]<tmin)
+            { tmin=time[nv]; nvbest=nv; }
+          }
+        /* NOTE(review): nvoracle is used as an index without a range check;
+           this relies on the compiled-in oracle table holding values in
+           [1;sharp_get_nv_max()] -- confirm. */
+        printf("nt: %d  %s  spin: %d   nv: %d   time: %6.3f   perf: %6.3f"
+          "   dev[%d]: %6.2f%%\n",ntr,(type==ALM2MAP)?"alm2map":"map2alm",
+          spin,nvbest,tmin,opmin/tmin*1e-9,nvoracle,
+          (time[nvoracle]-tmin)/tmin*100.);
+        DEALLOC(time);
+        fprintf(fp,"%d",nvbest);
+        /* the third table column is filled with -1 */
+        fputs((type==MAP2ALM)?",":",-1",fp);
+        }
+      fputs((spin==0)?"},":"}",fp);
+      printf("\n");
+      }
+    fputs((ntr<6)?"},\n":"}\n",fp);
+    }
+  fputs("};\n",fp);
+  /* fclose flushes the buffered table; a failure means a truncated file */
+  int close_status=fclose(fp);
+  UTIL_ASSERT(close_status==0, "failed to write oracle file");
+#ifdef USE_MPI
+  MPI_Finalize();
+#endif
+  return 0;
+  }
diff --git a/libsharp/sharp_core.c b/libsharp/sharp_core.c
new file mode 100644
index 0000000..ba5cd88
--- /dev/null
+++ b/libsharp/sharp_core.c
@@ -0,0 +1,238 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_core.c
+ *  Computational core
+ *
+ *  Copyright (C) 2012 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#include <complex.h>
+#include <math.h>
+#include <string.h>
+#include "vecsupport.h"
+#include "complex_hacks.h"
+#include "ylmgen_c.h"
+#include "sharp.h"
+#include "sharp_core.h"
+#include "c_utils.h"
+
+typedef complex double dcmplx;
+
+#define MAXJOB_SPECIAL 2
+
+#define XCONCAT2(a,b) a##_##b
+#define CONCAT2(a,b) XCONCAT2(a,b)
+#define XCONCAT3(a,b,c) a##_##b##_##c
+#define CONCAT3(a,b,c) XCONCAT3(a,b,c)
+
+#define nvec 1
+#include "sharp_inchelper1.inc.c"
+#undef nvec
+
+#define nvec 2
+#include "sharp_inchelper1.inc.c"
+#undef nvec
+
+#define nvec 3
+#include "sharp_inchelper1.inc.c"
+#undef nvec
+
+#define nvec 4
+#include "sharp_inchelper1.inc.c"
+#undef nvec
+
+#define nvec 5
+#include "sharp_inchelper1.inc.c"
+#undef nvec
+
+#define nvec 6
+#include "sharp_inchelper1.inc.c"
+#undef nvec
+
+/* Argument list common to every specialised kernel. */
+#define SHARP_KERNEL_ARGS job,ispair,cth,sth,llim,ulim,gen,mi,idx
+
+/* Emits the switch case that forwards to the kernel generated for nv_
+   vectors and a compile-time number nj_ of simultaneous transforms. */
+#define SHARP_CASE_FIXED(nv_,nj_) \
+  case (nj_)*16+(nv_): \
+    CONCAT3(inner_loop,nv_,nj_) (SHARP_KERNEL_ARGS); \
+    return;
+
+/* All six nv variants for a fixed number of transforms. */
+#define SHARP_CASES_FIXED(nj_) \
+  SHARP_CASE_FIXED(1,nj_) SHARP_CASE_FIXED(2,nj_) SHARP_CASE_FIXED(3,nj_) \
+  SHARP_CASE_FIXED(4,nj_) SHARP_CASE_FIXED(5,nj_) SHARP_CASE_FIXED(6,nj_)
+
+/* Emits the switch case that forwards to the kernel generated for nv_
+   vectors which receives the number of transforms at run time. */
+#define SHARP_CASE_GENERIC(nv_) \
+  case nv_: \
+    CONCAT2(inner_loop,nv_) (SHARP_KERNEL_ARGS,job->ntrans); \
+    return;
+
+/* Dispatches to the kernel matching job->nv and job->ntrans.
+   Up to MAXJOB_SPECIAL simultaneous transforms are served by kernels
+   specialised for that number; larger numbers use the kernels that take the
+   number of transforms as an additional argument. All other arguments are
+   forwarded unchanged. Fails with "Incorrect vector parameters" if no
+   kernel matches. */
+void inner_loop (sharp_job *job, const int *ispair,const double *cth,
+  const double *sth, int llim, int ulim, Ylmgen_C *gen, int mi, const int *idx)
+  {
+  int njobs=job->ntrans;
+  /* The switch key below packs (njobs,nv) as njobs*16+nv. An nv outside
+     [1;6] could alias a different pair (e.g. njobs=1, nv=17 equals 0x21),
+     so it is rejected before any dispatch takes place. */
+  UTIL_ASSERT((job->nv>=1)&&(job->nv<=6),"Incorrect vector parameters");
+  if (njobs<=MAXJOB_SPECIAL)
+    {
+    switch (njobs*16+job->nv)
+      {
+#if (MAXJOB_SPECIAL>=1)
+      SHARP_CASES_FIXED(1)
+#endif
+#if (MAXJOB_SPECIAL>=2)
+      SHARP_CASES_FIXED(2)
+#endif
+#if (MAXJOB_SPECIAL>=3)
+      SHARP_CASES_FIXED(3)
+#endif
+#if (MAXJOB_SPECIAL>=4)
+      SHARP_CASES_FIXED(4)
+#endif
+#if (MAXJOB_SPECIAL>=5)
+      SHARP_CASES_FIXED(5)
+#endif
+#if (MAXJOB_SPECIAL>=6)
+      SHARP_CASES_FIXED(6)
+#endif
+      }
+    }
+#if (MAXJOB_SPECIAL<6)
+  else
+    {
+    switch (job->nv)
+      {
+      SHARP_CASE_GENERIC(1)
+      SHARP_CASE_GENERIC(2)
+      SHARP_CASE_GENERIC(3)
+      SHARP_CASE_GENERIC(4)
+      SHARP_CASE_GENERIC(5)
+      SHARP_CASE_GENERIC(6)
+      }
+    }
+#endif
+  UTIL_FAIL("Incorrect vector parameters");
+  }
+
+#undef SHARP_CASE_GENERIC
+#undef SHARP_CASES_FIXED
+#undef SHARP_CASE_FIXED
+#undef SHARP_KERNEL_ARGS
diff --git a/libsharp/sharp_core.h b/libsharp/sharp_core.h
new file mode 100644
index 0000000..0699074
--- /dev/null
+++ b/libsharp/sharp_core.h
@@ -0,0 +1,49 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_core.h
+ *  Interface for the computational core
+ *
+ *  Copyright (C) 2012 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#ifndef PLANCK_SHARP_CORE_H
+#define PLANCK_SHARP_CORE_H
+
+#include "sharp.h"
+#include "ylmgen_c.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! Entry point of the computational core: selects the kernel matching
+    \a job->nv and \a job->ntrans and forwards all arguments to it unchanged.
+    Aborts if no kernel exists for that combination.
+    NOTE(review): the meaning of the remaining arguments is defined by the
+    kernels, which are not part of this header -- see their implementation. */
+void inner_loop (sharp_job *job, const int *ispair,const double *cth,
+  const double *sth, int llim, int ulim, Ylmgen_C *gen, int mi, const int *idx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c
new file mode 100644
index 0000000..b892f53
--- /dev/null
+++ b/libsharp/sharp_core_inc.c
@@ -0,0 +1,268 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_core_inc.c
+ *  Type-dependent code for the computational core
+ *
+ *  Copyright (C) 2012 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+typedef struct /* block of nvec vectors of type Tv */
+  { Tv v[nvec]; } Tb;
+
+typedef union /* a Tb overlaid with VLEN*nvec doubles for per-element access */
+  { Tb b; double s[VLEN*nvec]; } Y(Tbu);
+
+typedef struct /* two blocks named r and i (presumably real/imaginary parts) */
+  { Tb r, i; } Y(Tbri);
+
+typedef struct /* four blocks named qr, qi, ur, ui */
+  { Tb qr, qi, ur, ui; } Y(Tbqu);
+
+typedef struct /* double-array counterpart of Y(Tbri) */
+  { double r[VLEN*nvec], i[VLEN*nvec]; } Y(Tsri);
+
+typedef struct /* double-array counterpart of Y(Tbqu) */
+  { double qr[VLEN*nvec],qi[VLEN*nvec],ur[VLEN*nvec],ui[VLEN*nvec]; } Y(Tsqu);
+
+typedef union /* Y(Tbri) overlaid with its double-array form */
+  { Y(Tbri) b; Y(Tsri)s; } Y(Tburi);
+
+typedef union /* Y(Tbqu) overlaid with its double-array form */
+  { Y(Tbqu) b; Y(Tsqu)s; } Y(Tbuqu);
+
+static inline Tb Y(Tbconst)(double val)
+  { /* Returns a Tb in which each of the nvec vectors equals vload(val). */
+  Tb filled;
+  const Tv splat=vload(val);
+  for (int k=0; k<nvec; ++k) { filled.v[k]=splat; }
+  return filled;
+  }
+
+static inline void Y(Tbmuleq1)(Tb * restrict a, double b) /* *a *= b */
+  { const Tv fac=vload(b); int k=0; while (k<nvec) { vmuleq(a->v[k],fac); ++k; } }
+
+static inline Tb Y(Tbprod)(Tb a, Tb b) /* element-wise product of a and b */
+  { for (int k=0; k<nvec; ++k) a.v[k]=vmul(a.v[k],b.v[k]); return a; }
+
+static inline void Y(Tbmuleq)(Tb * restrict a, Tb b) /* *a *= b, element-wise */
+  { int k=0; while (k<nvec) { vmuleq(a->v[k],b.v[k]); ++k; } }
+
+static inline void Y(mypow) (Tb val, int npow, Tb * restrict resd,
+  Tb * restrict ress)
+  { /* Square-and-multiply val^npow; *resd gets res, *ress its scale counter. */
+  Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.);
+
+  do
+    {
+    if (npow&1) /* lowest remaining bit of npow is set: fold val into res */
+      {
+      for (int i=0; i<nvec; ++i)
+        {
+        vmuleq(res.v[i],val.v[i]);
+        vaddeq(scale.v[i],scaleint.v[i]); /* scale counters add under multiplication */
+        Tv mask=vlt(vabs(res.v[i]),vload(fsmall)); /* |res| < fsmall */
+        vmuleq(res.v[i],vblend(mask,vload(fbig),vone)); /* presumably *fbig where mask is set */
+        vsubeq(scale.v[i],vblend(mask,vone,vzero)); /* presumably scale-=1 in those lanes */
+        }
+      }
+    for (int i=0; i<nvec; ++i) /* square val; scaleint tracks its scale counter */
+      {
+      vmuleq(val.v[i],val.v[i]);
+      vaddeq(scaleint.v[i],scaleint.v[i]); /* squaring doubles the counter */
+      Tv mask = vlt(vabs(val.v[i]),vload(fsmall)); /* |val| < fsmall */
+      vmuleq(val.v[i],vblend(mask,vload(fbig),vone));
+      vsubeq(scaleint.v[i],vblend(mask,vone,vzero));
+      }
+    }
+  while(npow>>=1); /* drop the processed bit; loop ends once npow becomes 0 */
+
+  *resd=res;
+  *ress=scale;
+  }
+
+static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
+  Tb * restrict scale)
+  { /* Rescales lam1/lam2 where |lam2|>1; returns 1 if any vector was touched. */
+  int did_scale=0;
+  for (int i=0;i<nvec; ++i)
+    {
+    Tv mask = vgt(vabs(lam2->v[i]),vone); /* |lam2| > 1 */
+    if (vanyTrue(mask))
+      {
+      did_scale=1;
+      Tv fact = vblend(mask,vload(fsmall),vone); /* presumably fsmall where mask set, else 1 */
+      vmuleq(lam1->v[i],fact); vmuleq(lam2->v[i],fact); /* same factor for both */
+      vaddeq(scale->v[i],vblend(mask,vone,vzero)); /* record the rescaling in *scale */
+      }
+    }
+  return did_scale;
+  }
+
+static inline void Y(normalize) (Tb * restrict val, Tb * restrict scale)
+  { /* Multiplies *val by fsmall/fbig as needed, counting each step in *scale. */
+  const Tv vfsmall=vload(fsmall), vfbig=vload(fbig);
+  for (int i=0;i<nvec; ++i)
+    {
+    Tv mask = vgt(vabs(val->v[i]),vone);
+    while (vanyTrue(mask)) /* repeat while some |val| > 1 */
+      {
+      vmuleq(val->v[i],vblend(mask,vfsmall,vone));
+      vaddeq(scale->v[i],vblend(mask,vone,vzero));
+      mask = vgt(vabs(val->v[i]),vone);
+      }
+    mask = vlt(vabs(val->v[i]),vfsmall);
+    mask = vand(mask,vne(val->v[i],vzero)); /* values equal to zero are excluded */
+    while (vanyTrue(mask)) /* repeat while some nonzero |val| < fsmall */
+      {
+      vmuleq(val->v[i],vblend(mask,vfbig,vone));
+      vsubeq(scale->v[i],vblend(mask,vone,vzero));
+      mask = vlt(vabs(val->v[i]),vfsmall);
+      mask = vand(mask,vne(val->v[i],vzero));
+      }
+    }
+  }
+
+static inline int Y(TballLt)(Tb a,double b)
+  { /* vallTrue() of the AND over all nvec vectors of the test a.v[k] < b */
+  const Tv bound=vload(b);
+  Tv all_lt=vlt(a.v[0],bound);
+  int k=1;
+  while (k<nvec) { all_lt=vand(all_lt,vlt(a.v[k],bound)); ++k; }
+  return vallTrue(all_lt);
+  }
+static inline int Y(TballGt)(Tb a,double b)
+  { /* vallTrue() of the AND over all nvec vectors of the test a.v[k] > b */
+  const Tv bound=vload(b);
+  Tv all_gt=vgt(a.v[0],bound);
+  int k=1;
+  while (k<nvec) { all_gt=vand(all_gt,vgt(a.v[k],bound)); ++k; }
+  return vallTrue(all_gt);
+  }
+
+static inline void Y(getCorfac)(Tb scale, Tb * restrict corfac,
+  const double * restrict cf)
+  { /* Per element: 0 if scale<minscale, otherwise cf[(int)scale-minscale]. */
+  Y(Tbu) sc, corf;
+  sc.b=scale; /* the union exposes the individual doubles as sc.s[] */
+  for (int i=0; i<VLEN*nvec; ++i)
+    corf.s[i] = (sc.s[i]<minscale) ? 0. : cf[(int)(sc.s[i])-minscale];
+  *corfac=corf.b;
+  }
+
+static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
+  Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_,
+  const Ylmgen_C * restrict gen)
+  { /* Steps the recurrence from l=gen->m while Y(TballLt)(scale,limscale). */
+  int l=gen->m;
+  Tb lam_1=Y(Tbconst)(0.), lam_2, scale;
+  Y(mypow) (sth,l,&lam_2,&scale); /* lam_2 = sth^m together with its scale counter */
+  Y(Tbmuleq1) (&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]); /* negated for odd m */
+  Y(normalize)(&lam_2,&scale);
+
+  int below_limit = Y(TballLt)(scale,limscale);
+  while (below_limit)
+    {
+    if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} /* on this exit only *l_ is written */
+    Tv r0=vload(gen->rf[l].f[0]),r1=vload(gen->rf[l].f[1]);
+    for (int i=0; i<nvec; ++i) /* lam_1 = cth*lam_2*r0 - lam_1*r1 */
+      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+    r0=vload(gen->rf[l+1].f[0]); r1=vload(gen->rf[l+1].f[1]);
+    for (int i=0; i<nvec; ++i) /* lam_2 = cth*lam_1*r0 - lam_2*r1 */
+      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+    if (Y(rescale)(&lam_1,&lam_2,&scale)) /* scale only changes when rescale reports 1 */
+      below_limit = Y(TballLt)(scale,limscale);
+    l+=2;
+    }
+  *l_=l; *lam_1_=lam_1; *lam_2_=lam_2; *scale_=scale;
+  }
+
+static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
+  Tb * restrict ryp, Tb * restrict rym, const Tb cth, const ylmgen_dbl3 fx)
+  { /* One recurrence step updating *rxp and *rxm from *ryp and *rym. */
+  Tv fx0=vload(fx.f[0]),fx1=vload(fx.f[1]),fx2=vload(fx.f[2]);
+  for (int i=0; i<nvec; ++i)
+    {
+    rxp->v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,ryp->v[i])), /* (cth-fx1)*fx0*ryp */
+                vmul(fx2,rxp->v[i])); /* minus fx2*rxp */
+    rxm->v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rym->v[i])), /* (cth+fx1)*fx0*rym */
+                vmul(fx2,rxm->v[i])); /* minus fx2*rxm */
+    }
+  }
+
+static void Y(iter_to_ieee_spin) (const Tb cth, int *l_,
+  Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_,
+  Tb * scalep_, Tb * scalem_, const Ylmgen_C * restrict gen)
+  { /* Builds start values for the p/m recurrences and steps them from gen->mhi. */
+  const ylmgen_dbl3 * restrict fx = gen->fx;
+  Tb cth2, sth2;
+  for (int i=0; i<nvec; ++i)
+    {
+    cth2.v[i]=vsqrt(vmul(vadd(vone,cth.v[i]),vload(0.5))); /* sqrt((1+cth)/2) */
+    cth2.v[i]=vmax(cth2.v[i],vload(1e-15)); /* bounded below by 1e-15 */
+    sth2.v[i]=vsqrt(vmul(vsub(vone,cth.v[i]),vload(0.5))); /* sqrt((1-cth)/2) */
+    sth2.v[i]=vmax(sth2.v[i],vload(1e-15)); /* bounded below by 1e-15 */
+    }
+
+  Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps; /* powers and their scale counters */
+  Y(mypow)(cth2,gen->cosPow,&ccp,&ccps); Y(mypow)(sth2,gen->sinPow,&ssp,&ssps);
+  Y(mypow)(cth2,gen->sinPow,&csp,&csps); Y(mypow)(sth2,gen->cosPow,&scp,&scps);
+
+  Tb rec2p, rec2m, scalep, scalem;
+  Tb rec1p=Y(Tbconst)(0.), rec1m=Y(Tbconst)(0.);
+  Tv prefac=vload(gen->prefac[gen->m]),
+     prescale=vload(gen->fscale[gen->m]);
+  for (int i=0; i<nvec; ++i)
+    {
+    rec2p.v[i]=vmul(vmul(prefac,ccp.v[i]),ssp.v[i]); /* prefac*cth2^cosPow*sth2^sinPow */
+    scalep.v[i]=vadd(vadd(prescale,ccps.v[i]),ssps.v[i]); /* scale counters add up */
+    rec2m.v[i]=vmul(vmul(prefac,csp.v[i]),scp.v[i]); /* prefac*cth2^sinPow*sth2^cosPow */
+    scalem.v[i]=vadd(vadd(prescale,csps.v[i]),scps.v[i]);
+    if (gen->preMinus_p) /* sign flips requested by the generator */
+      rec2p.v[i]=vneg(rec2p.v[i]);
+    if (gen->preMinus_m)
+      rec2m.v[i]=vneg(rec2m.v[i]);
+    if (gen->s&1) /* additional sign flip of rec2p when gen->s is odd */
+      rec2p.v[i]=vneg(rec2p.v[i]);
+    }
+  Y(normalize)(&rec2m,&scalem); Y(normalize)(&rec2p,&scalep);
+
+  int l=gen->mhi;
+
+  int below_limit = Y(TballLt)(scalep,limscale) && Y(TballLt)(scalem,limscale);
+  while (below_limit)
+    {
+    if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} /* on this exit only *l_ is written */
+    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l+1]);
+    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l+2]);
+    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) /* bitwise |: both calls always run */
+      below_limit = Y(TballLt)(scalep,limscale) && Y(TballLt)(scalem,limscale);
+    l+=2;
+    }
+
+  *l_=l;
+  *rec1p_=rec1p; *rec2p_=rec2p; *scalep_=scalep;
+  *rec1m_=rec1m; *rec2m_=rec2m; *scalem_=scalem;
+  }
diff --git a/libsharp/sharp_core_inc2.c b/libsharp/sharp_core_inc2.c
new file mode 100644
index 0000000..e42e6de
--- /dev/null
+++ b/libsharp/sharp_core_inc2.c
@@ -0,0 +1,702 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_core_inc2.c
+ *  Type-dependent code for the computational core
+ *
+ *  Copyright (C) 2012 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+typedef struct /* njobs instances of Y(Tbri) */
+  { Y(Tbri) j[njobs]; } Z(Tbrij);
+typedef union /* Z(Tbrij) overlaid with njobs double-array Y(Tsri) records */
+  { Z(Tbrij) b; Y(Tsri) j[njobs]; } Z(Tburij);
+typedef struct /* njobs instances of Y(Tbqu) */
+  { Y(Tbqu) j[njobs]; } Z(Tbquj);
+typedef union /* Z(Tbquj) overlaid with njobs double-array Y(Tsqu) records */
+  { Z(Tbquj) b; Y(Tsqu) j[njobs]; } Z(Tbuquj);
+
+static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1,
+  Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2,
+  const ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
+  int l, int lmax)
+  { /* Adds lam*alm terms for l..lmax to p1 and p2, alternating between them. */
+#if (njobs>1)
+  while (l<lmax-2) /* unrolled: handles l, l+1, l+2 and l+3 in one pass */
+    {
+    Tb lam_3, lam_4;
+    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
+    for (int i=0; i<nvec; ++i)
+      lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
+    for (int i=0; i<nvec; ++i)
+      lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
+    r0=vload(rf[l+2].f[0]);r1=vload(rf[l+2].f[1]);
+    for (int i=0; i<nvec; ++i)
+      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv ar2=vload(creal(alm[njobs*l+j])), /* coefficients for l and l+2 */
+         ai2=vload(cimag(alm[njobs*l+j])),
+         ar4=vload(creal(alm[njobs*(l+2)+j])),
+         ai4=vload(cimag(alm[njobs*(l+2)+j]));
+      for (int i=0; i<nvec; ++i)
+        {
+        vfmaaeq(p1->j[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4); /* presumably a+=b*c+d*e - TODO confirm */
+        vfmaaeq(p1->j[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4);
+        }
+      Tv ar3=vload(creal(alm[njobs*(l+1)+j])), /* coefficients for l+1 and l+3 */
+         ai3=vload(cimag(alm[njobs*(l+1)+j])),
+         ar1=vload(creal(alm[njobs*(l+3)+j])),
+         ai1=vload(cimag(alm[njobs*(l+3)+j]));
+      for (int i=0; i<nvec; ++i)
+        {
+        vfmaaeq(p2->j[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1);
+        vfmaaeq(p2->j[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1);
+        }
+      }
+    r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]);
+    for (int i=0; i<nvec; ++i) /* lam_2 becomes the value for l+4 */
+      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
+    l+=4;
+    }
+#endif
+  while (l<lmax) /* two l values per pass: l goes to p1, l+1 to p2 */
+    {
+    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
+    for (int i=0; i<nvec; ++i)
+      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv ar=vload(creal(alm[njobs*l+j])),
+         ai=vload(cimag(alm[njobs*l+j]));
+      for (int i=0; i<nvec; ++i)
+        {
+        vfmaeq(p1->j[j].r.v[i],lam_2.v[i],ar); /* presumably a+=b*c - TODO confirm */
+        vfmaeq(p1->j[j].i.v[i],lam_2.v[i],ai);
+        }
+      ar=vload(creal(alm[njobs*(l+1)+j]));
+      ai=vload(cimag(alm[njobs*(l+1)+j]));
+      for (int i=0; i<nvec; ++i)
+        {
+        vfmaeq(p2->j[j].r.v[i],lam_1.v[i],ar);
+        vfmaeq(p2->j[j].i.v[i],lam_1.v[i],ai);
+        }
+      }
+    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
+    for (int i=0; i<nvec; ++i)
+      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+    l+=2;
+    }
+  if (l==lmax) /* one l value left over: it contributes to p1 only */
+    {
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
+      for (int i=0; i<nvec; ++i)
+        {
+        vfmaeq(p1->j[j].r.v[i],lam_2.v[i],ar);
+        vfmaeq(p1->j[j].i.v[i],lam_2.v[i],ai);
+        }
+      }
+    }
+  }
+
+static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1,
+  const Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2,
+  const ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax)
+  { /* Reduces lam*p1 / lam*p2 over all vectors into alm entries for l..lmax. */
+  while (l<lmax) /* two l values per pass: l uses p1, l+1 uses p2 */
+    {
+    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
+    for (int i=0; i<nvec; ++i)
+      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv tr1=vzero, ti1=vzero, tr2=vzero, ti2=vzero; /* per-job partial sums */
+      for (int i=0; i<nvec; ++i)
+        {
+        vfmaeq(tr1,lam_2.v[i],p1->j[j].r.v[i]); /* presumably a+=b*c - TODO confirm */
+        vfmaeq(ti1,lam_2.v[i],p1->j[j].i.v[i]);
+        }
+      for (int i=0; i<nvec; ++i)
+        {
+        vfmaeq(tr2,lam_1.v[i],p2->j[j].r.v[i]);
+        vfmaeq(ti2,lam_1.v[i],p2->j[j].i.v[i]);
+        }
+      vhsum_cmplx2(tr1,ti1,tr2,ti2,&alm[l*njobs+j],&alm[(l+1)*njobs+j]); /* NOTE(review): presumably adds the horizontal sums to both entries - confirm */
+      }
+    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
+    for (int i=0; i<nvec; ++i)
+      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+    l+=2;
+    }
+  if (l==lmax) /* one l value left over: it uses p1 only */
+    {
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv tre=vzero, tim=vzero;
+      for (int i=0; i<nvec; ++i)
+        {
+        vfmaeq(tre,lam_2.v[i],p1->j[j].r.v[i]);
+        vfmaeq(tim,lam_2.v[i],p1->j[j].i.v[i]);
+        }
+      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
+      }
+    }
+  }
+
+static void Z(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
+  sharp_job *job, Z(Tbrij) * restrict p1, Z(Tbrij) * restrict p2, int *done)
+  { /* Accumulates job->almtmp times lam into p1/p2; sets *done if l passes lmax. */
+  int l,lmax=gen->lmax;
+  Tb lam_1,lam_2,scale;
+  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
+  job->opcnt += (l-gen->m) * 4*VLEN*nvec; /* operation-count bookkeeping */
+  if (l>lmax) { *done=1; return; } /* Y(iter_to_ieee) already ran past lmax */
+  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
+
+  Tb corfac;
+  Y(getCorfac)(scale,&corfac,gen->cf);
+  const ylmgen_dbl2 * restrict rf = gen->rf;
+  const dcmplx * restrict alm=job->almtmp;
+  int full_ieee = Y(TballGt)(scale,minscale);
+  while (!full_ieee) /* scaled phase: every term is multiplied by corfac */
+    {
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
+      for (int i=0; i<nvec; ++i)
+        {
+        Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
+        vfmaeq(p1->j[j].r.v[i],tmp,ar);
+        vfmaeq(p1->j[j].i.v[i],tmp,ai);
+        }
+      }
+    if (++l>lmax) break;
+    Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]); /* l was just incremented, hence l-1 */
+    for (int i=0; i<nvec; ++i)
+      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
+      for (int i=0; i<nvec; ++i)
+        {
+        Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
+        vfmaeq(p2->j[j].r.v[i],tmp,ar);
+        vfmaeq(p2->j[j].i.v[i],tmp,ai);
+        }
+      }
+    if (++l>lmax) break;
+    r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
+    for (int i=0; i<nvec; ++i)
+      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+    if (Y(rescale)(&lam_1,&lam_2,&scale)) /* refresh corfac only when scale changed */
+      {
+      Y(getCorfac)(scale,&corfac,gen->cf);
+      full_ieee = Y(TballGt)(scale,minscale);
+      }
+    }
+  if (l>lmax) { *done=1; return; } /* reached via one of the breaks above */
+
+  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); /* apply corfac once, then run the plain kernel */
+  Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
+  }
+
+static void Z(calc_map2alm) (const Tb cth, const Tb sth,
+  const Ylmgen_C *gen, sharp_job *job, const Z(Tbrij) * restrict p1,
+  const Z(Tbrij) * restrict p2, int *done)
+  { /* Adds reductions of lam*p1 / lam*p2 to job->almtmp; sets *done past lmax. */
+  int lmax=gen->lmax;
+  Tb lam_1,lam_2,scale;
+  int l=gen->m; /* overwritten by Y(iter_to_ieee) on every path */
+  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
+  job->opcnt += (l-gen->m) * 4*VLEN*nvec; /* operation-count bookkeeping */
+  if (l>lmax) { *done=1; return; } /* Y(iter_to_ieee) already ran past lmax */
+  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
+
+  const ylmgen_dbl2 * restrict rf = gen->rf;
+  Tb corfac;
+  Y(getCorfac)(scale,&corfac,gen->cf);
+  dcmplx * restrict alm=job->almtmp;
+  int full_ieee = Y(TballGt)(scale,minscale);
+  while (!full_ieee) /* scaled phase: every term is multiplied by corfac */
+    {
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv tre=vzero, tim=vzero;
+      for (int i=0; i<nvec; ++i)
+        {
+        Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
+        vfmaeq(tre,tmp,p1->j[j].r.v[i]);
+        vfmaeq(tim,tmp,p1->j[j].i.v[i]);
+        }
+      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
+      }
+    if (++l>lmax) { *done=1; return; }
+    Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]); /* l was just incremented, hence l-1 */
+    for (int i=0; i<nvec; ++i)
+      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv tre=vzero, tim=vzero;
+      for (int i=0; i<nvec; ++i)
+        {
+        Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
+        vfmaeq(tre,tmp,p2->j[j].r.v[i]);
+        vfmaeq(tim,tmp,p2->j[j].i.v[i]);
+        }
+      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
+      }
+    if (++l>lmax) { *done=1; return; }
+    r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
+    for (int i=0; i<nvec; ++i)
+      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+    if (Y(rescale)(&lam_1,&lam_2,&scale)) /* refresh corfac only when scale changed */
+      {
+      Y(getCorfac)(scale,&corfac,gen->cf);
+      full_ieee = Y(TballGt)(scale,minscale);
+      }
+    }
+
+  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); /* apply corfac once, then run the plain kernel */
+  Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax);
+  }
+
+static inline void Z(saddstep) (Z(Tbquj) * restrict px, Z(Tbquj) * restrict py,
+  const Tb rxp, const Tb rxm, const dcmplx * restrict alm)
+  { /* Adds one l-term to px (weight rxp+rxm) and to py (weight rxm-rxp). */
+  for (int j=0; j<njobs; ++j)
+    {
+    Tv agr=vload(creal(alm[2*j])), agi=vload(cimag(alm[2*j])), /* alm[2*j]: first coefficient of job j */
+       acr=vload(creal(alm[2*j+1])), aci=vload(cimag(alm[2*j+1])); /* alm[2*j+1]: second coefficient */
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv lw=vadd(rxp.v[i],rxm.v[i]);
+      vfmaeq(px->j[j].qr.v[i],agr,lw); /* presumably a+=b*c - TODO confirm */
+      vfmaeq(px->j[j].qi.v[i],agi,lw);
+      vfmaeq(px->j[j].ur.v[i],acr,lw);
+      vfmaeq(px->j[j].ui.v[i],aci,lw);
+      }
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv lx=vsub(rxm.v[i],rxp.v[i]);
+      vfmseq(py->j[j].qr.v[i],aci,lx); /* presumably a-=b*c - TODO confirm */
+      vfmaeq(py->j[j].qi.v[i],acr,lx);
+      vfmaeq(py->j[j].ur.v[i],agi,lx);
+      vfmseq(py->j[j].ui.v[i],agr,lx);
+      }
+    }
+  }
+
+static inline void Z(saddstepb) (Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2,
+  const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
+  const dcmplx * restrict alm1, const dcmplx * restrict alm2)
+  { /* Fused form of two Z(saddstep) calls: (p1,p2,r2*,alm1) and (p2,p1,r1*,alm2). */
+  for (int j=0; j<njobs; ++j)
+    {
+    Tv agr1=vload(creal(alm1[2*j])), agi1=vload(cimag(alm1[2*j])),
+       acr1=vload(creal(alm1[2*j+1])), aci1=vload(cimag(alm1[2*j+1]));
+    Tv agr2=vload(creal(alm2[2*j])), agi2=vload(cimag(alm2[2*j])),
+       acr2=vload(creal(alm2[2*j+1])), aci2=vload(cimag(alm2[2*j+1]));
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv lw1=vadd(r2p.v[i],r2m.v[i]); /* sum weight from the r2 pair */
+      Tv lx2=vsub(r1m.v[i],r1p.v[i]); /* difference weight from the r1 pair */
+      vfmaseq(p1->j[j].qr.v[i],agr1,lw1,aci2,lx2); /* presumably a+=b*c-d*e - TODO confirm */
+      vfmaaeq(p1->j[j].qi.v[i],agi1,lw1,acr2,lx2); /* presumably a+=b*c+d*e - TODO confirm */
+      vfmaaeq(p1->j[j].ur.v[i],acr1,lw1,agi2,lx2);
+      vfmaseq(p1->j[j].ui.v[i],aci1,lw1,agr2,lx2);
+      }
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv lx1=vsub(r2m.v[i],r2p.v[i]); /* difference weight from the r2 pair */
+      Tv lw2=vadd(r1p.v[i],r1m.v[i]); /* sum weight from the r1 pair */
+      vfmaseq(p2->j[j].qr.v[i],agr2,lw2,aci1,lx1);
+      vfmaaeq(p2->j[j].qi.v[i],agi2,lw2,acr1,lx1);
+      vfmaaeq(p2->j[j].ur.v[i],acr2,lw2,agi1,lx1);
+      vfmaseq(p2->j[j].ui.v[i],aci2,lw2,agr1,lx1);
+      }
+    }
+  }
+
+static inline void Z(saddstep2) (const Z(Tbquj) * restrict px,
+  const Z(Tbquj) * restrict py, const Tb * restrict rxp,
+  const Tb * restrict rxm, dcmplx * restrict alm)
+  { /* Reduces px (weight rxp+rxm) and py (weight rxm-rxp) into alm[2*j], alm[2*j+1]. */
+  for (int j=0; j<njobs; ++j)
+    {
+    Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero; /* per-job partial sums */
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv lw=vadd(rxp->v[i],rxm->v[i]);
+      vfmaeq(agr,px->j[j].qr.v[i],lw); /* presumably a+=b*c - TODO confirm */
+      vfmaeq(agi,px->j[j].qi.v[i],lw);
+      vfmaeq(acr,px->j[j].ur.v[i],lw);
+      vfmaeq(aci,px->j[j].ui.v[i],lw);
+      }
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv lx=vsub(rxm->v[i],rxp->v[i]);
+      vfmseq(agr,py->j[j].ui.v[i],lx); /* presumably a-=b*c - TODO confirm */
+      vfmaeq(agi,py->j[j].ur.v[i],lx);
+      vfmaeq(acr,py->j[j].qi.v[i],lx);
+      vfmseq(aci,py->j[j].qr.v[i],lx);
+      }
+    vhsum_cmplx2(agr,agi,acr,aci,&alm[2*j],&alm[2*j+1]); /* NOTE(review): presumably adds the horizontal sums to both entries - confirm */
+    }
+  }
+
+static void Z(alm2map_spin_kernel) (Tb cth, Z(Tbquj) * restrict p1,
+  Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
+  const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, int lmax)
+  { /* Steps the p/m recurrences from l to lmax, adding each term to p1/p2. */
+  while (l<lmax) /* two l values per pass */
+    {
+    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
+       fx2=vload(fx[l+1].f[2]);
+    for (int i=0; i<nvec; ++i) /* same update as Y(rec_step), written inline */
+      {
+      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
+                        vmul(fx2,rec1p.v[i]));
+      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
+                        vmul(fx2,rec1m.v[i]));
+      }
+#if (njobs>1)
+    Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l], /* fused variant for l and l+1 */
+      &alm[2*njobs*(l+1)]);
+#else
+    Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l]); /* term for l */
+    Z(saddstep)(p2, p1, rec1p, rec1m, &alm[2*njobs*(l+1)]); /* term for l+1, p1/p2 swapped */
+#endif
+    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
+    fx2=vload(fx[l+2].f[2]);
+    for (int i=0; i<nvec; ++i)
+      {
+      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
+                        vmul(fx2,rec2p.v[i]));
+      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
+                        vmul(fx2,rec2m.v[i]));
+      }
+    l+=2;
+    }
+  if (l==lmax) /* one l value left over */
+    Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l]);
+  }
+
+static void Z(map2alm_spin_kernel) (Tb cth, const Z(Tbquj) * restrict p1,
+  const Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
+  const ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax)
+  { /* Steps the p/m recurrences from l to lmax, reducing p1/p2 into alm. */
+  while (l<lmax) /* two l values per pass */
+    {
+    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
+       fx2=vload(fx[l+1].f[2]);
+    for (int i=0; i<nvec; ++i) /* same update as Y(rec_step), written inline */
+      {
+      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
+                        vmul(fx2,rec1p.v[i]));
+      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
+                        vmul(fx2,rec1m.v[i]));
+      }
+    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l]); /* term for l */
+    Z(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*njobs*(l+1)]); /* term for l+1, p1/p2 swapped */
+    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
+    fx2=vload(fx[l+2].f[2]);
+    for (int i=0; i<nvec; ++i)
+      {
+      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
+                        vmul(fx2,rec2p.v[i]));
+      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
+                        vmul(fx2,rec2m.v[i]));
+      }
+    l+=2;
+    }
+  if (l==lmax) /* one l value left over */
+    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l]);
+  }
+
+static void Z(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
+  sharp_job *job, Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, int *done)
+  { /* Spin variant: accumulates job->almtmp terms into p1/p2; sets *done past lmax. */
+  int l, lmax=gen->lmax;
+  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
+  Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
+  job->opcnt += (l-gen->m) * 10*VLEN*nvec; /* operation-count bookkeeping */
+  if (l>lmax) /* Y(iter_to_ieee_spin) already ran past lmax */
+   { *done=1; return; }
+  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
+
+  const ylmgen_dbl3 * restrict fx = gen->fx;
+  Tb corfacp,corfacm;
+  Y(getCorfac)(scalep,&corfacp,gen->cf);
+  Y(getCorfac)(scalem,&corfacm,gen->cf);
+  const dcmplx * restrict alm=job->almtmp;
+  int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+  while (!full_ieee) /* scaled phase: recurrence values are multiplied by corfac per term */
+    {
+    Z(saddstep)(p1, p2,
+      Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), &alm[2*njobs*l]);
+    if (++l>lmax) break;
+    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
+    Z(saddstep)(p2, p1,
+      Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), &alm[2*njobs*l]);
+    if (++l>lmax) break;
+    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
+    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) /* bitwise |: both calls always run */
+      {
+      Y(getCorfac)(scalep,&corfacp,gen->cf);
+      Y(getCorfac)(scalem,&corfacm,gen->cf);
+      full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+      }
+    }
+
+  if (l>lmax) /* reached via one of the breaks above */
+    { *done=1; return; }
+
+  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); /* apply corfac once, then run the plain kernel */
+  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
+  Z(alm2map_spin_kernel) (cth,p1,p2,
+    rec1p, rec1m, rec2p, rec2m, fx, alm, l, lmax);
+  }
+
+static void Z(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
+  sharp_job *job, const Z(Tbquj) * restrict p1, const Z(Tbquj) * restrict p2,
+  int *done)
+  { /* Spin variant: reduces p1/p2 into job->almtmp; sets *done past lmax. */
+  int l, lmax=gen->lmax;
+  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
+  Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
+  job->opcnt += (l-gen->m) * 10*VLEN*nvec; /* operation-count bookkeeping */
+  if (l>lmax) { *done=1; return; } /* Y(iter_to_ieee_spin) already ran past lmax */
+  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
+
+  const ylmgen_dbl3 * restrict fx = gen->fx;
+  Tb corfacp,corfacm;
+  Y(getCorfac)(scalep,&corfacp,gen->cf);
+  Y(getCorfac)(scalem,&corfacm,gen->cf);
+  dcmplx * restrict alm=job->almtmp;
+  int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+  while (!full_ieee) /* scaled phase: recurrence values are multiplied by corfac per term */
+    {
+    Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm); /* temporaries: Z(saddstep2) takes pointers */
+    Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l]);
+    if (++l>lmax) { *done=1; return; }
+    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
+    t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm);
+    Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l]);
+    if (++l>lmax) { *done=1; return; }
+    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
+    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) /* bitwise |: both calls always run */
+      {
+      Y(getCorfac)(scalep,&corfacp,gen->cf);
+      Y(getCorfac)(scalem,&corfacm,gen->cf);
+      full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+      }
+    }
+
+  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); /* apply corfac once, then run the plain kernel */
+  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
+  Z(map2alm_spin_kernel) (cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax);
+  }
+
+#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) /* zero-fill the object var in place */
+
+static void Z(inner_loop) (sharp_job *job, const int *ispair,
+  const double *cth_, const double *sth_, int llim, int ulim, Ylmgen_C *gen,
+  int mi, const int *idx)
+  { /* Processes entries 0..ulim-llim-1 of idx in batches of nval for m-index mi. */
+  const int nval=nvec*VLEN; /* number of doubles handled per batch */
+  const int m = job->ainfo->mval[mi];
+  Ylmgen_prepare (gen, m);
+
+  switch (job->type) /* NOTE(review): no default case - other job types fall through silently */
+    {
+    case ALM2MAP:
+      {
+      if (job->spin==0)
+        {
+        int done=0;
+        for (int ith=0; ith<ulim-llim; ith+=nval)
+          {
+          Z(Tburij) p1,p2; VZERO(p1); VZERO(p2);
+          if (!done) /* once done is set, later batches just store the zeroed p1/p2 */
+            {
+            Y(Tbu) cth, sth;
+
+            for (int i=0; i<nval; ++i)
+              {
+              int itot=i+ith;
+              if (itot>=ulim-llim) itot=ulim-llim-1; /* pad a partial batch by repeating the last index */
+              itot=idx[itot];
+              cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
+              }
+            Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1.b,&p2.b,&done);
+            }
+
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot<ulim-llim) /* padding elements are not written back */
+              {
+              itot=idx[itot];
+              for (int j=0; j<njobs; ++j)
+                {
+                int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi)); /* two phase slots per (job, entry, mi) */
+                complex double r1 = p1.j[j].r[i] + p1.j[j].i[i]*_Complex_I,
+                               r2 = p2.j[j].r[i] + p2.j[j].i[i]*_Complex_I;
+                job->phase[phas_idx] = r1+r2;
+                if (ispair[itot]) /* second slot is written only when ispair is set */
+                  job->phase[phas_idx+1] = r1-r2;
+                }
+              }
+            }
+          }
+        }
+      else
+        {
+        int done=0;
+        for (int ith=0; ith<ulim-llim; ith+=nval)
+          {
+          Z(Tbuquj) p1,p2; VZERO(p1); VZERO(p2);
+          if (!done) /* once done is set, later batches just store the zeroed p1/p2 */
+            {
+            Y(Tbu) cth;
+
+            for (int i=0; i<nval; ++i)
+              {
+              int itot=i+ith;
+              if (itot>=ulim-llim) itot=ulim-llim-1; /* pad a partial batch by repeating the last index */
+              itot=idx[itot];
+              cth.s[i]=cth_[itot];
+              }
+            Z(calc_alm2map_spin) (cth.b,gen,job,&p1.b,&p2.b,&done);
+            }
+
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot<ulim-llim) /* padding elements are not written back */
+              {
+              itot=idx[itot];
+              for (int j=0; j<njobs; ++j)
+                {
+                int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi)); /* four phase slots per (job, entry, mi) */
+                complex double q1 = p1.j[j].qr[i] + p1.j[j].qi[i]*_Complex_I,
+                               q2 = p2.j[j].qr[i] + p2.j[j].qi[i]*_Complex_I,
+                               u1 = p1.j[j].ur[i] + p1.j[j].ui[i]*_Complex_I,
+                               u2 = p2.j[j].ur[i] + p2.j[j].ui[i]*_Complex_I;
+                job->phase[phas_idx] = q1+q2;
+                job->phase[phas_idx+2] = u1+u2;
+                if (ispair[itot]) /* slots +1 and +3 are written only when ispair is set */
+                  {
+                  dcmplx *phQ = &(job->phase[phas_idx+1]),
+                         *phU = &(job->phase[phas_idx+3]);
+                  *phQ = q1-q2;
+                  *phU = u1-u2;
+                  if ((gen->mhi-gen->m+gen->s)&1) /* negate both when mhi-m+s is odd */
+                    { *phQ=-(*phQ); *phU=-(*phU); }
+                  }
+                }
+              }
+            }
+          }
+        }
+      break;
+      }
+    case ALM2MAP_DERIV1: /* NOTE(review): no work is done for this job type here - confirm it is handled elsewhere */
+      break;
+    case MAP2ALM:
+      {
+      if (job->spin==0)
+        {
+        int done=0;
+        for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval) /* stops after the first batch that sets done */
+          {
+          Z(Tburij) p1, p2; VZERO(p1); VZERO(p2);
+          Y(Tbu) cth, sth;
+
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot>=ulim-llim) itot=ulim-llim-1; /* pad a partial batch by repeating the last index */
+            itot=idx[itot];
+            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
+            if (i+ith<ulim-llim) /* padding elements keep the zeroed p1/p2 */
+              {
+              for (int j=0; j<njobs; ++j)
+                {
+                int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi));
+                dcmplx ph1=job->phase[phas_idx];
+                dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.; /* second slot is read only when ispair is set */
+                p1.j[j].r[i]=creal(ph1+ph2); p1.j[j].i[i]=cimag(ph1+ph2);
+                p2.j[j].r[i]=creal(ph1-ph2); p2.j[j].i[i]=cimag(ph1-ph2);
+                }
+              }
+            }
+          Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1.b,&p2.b,&done);
+          }
+        }
+      else
+        {
+        int done=0;
+        for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval) /* stops after the first batch that sets done */
+          {
+          Z(Tbuquj) p1, p2; VZERO(p1); VZERO(p2);
+          Y(Tbu) cth;
+
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot>=ulim-llim) itot=ulim-llim-1; /* pad a partial batch by repeating the last index */
+            itot=idx[itot];
+            cth.s[i]=cth_[itot];
+            if (i+ith<ulim-llim) /* padding elements keep the zeroed p1/p2 */
+              {
+              for (int j=0; j<njobs; ++j)
+                {
+                int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi));
+                dcmplx p1Q=job->phase[phas_idx],
+                       p1U=job->phase[phas_idx+2],
+                       p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
+                       p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
+                if ((gen->mhi-gen->m+gen->s)&1) /* negate the second pair when mhi-m+s is odd */
+                  { p2Q=-p2Q; p2U=-p2U; }
+                p1.j[j].qr[i]=creal(p1Q+p2Q); p1.j[j].qi[i]=cimag(p1Q+p2Q);
+                p1.j[j].ur[i]=creal(p1U+p2U); p1.j[j].ui[i]=cimag(p1U+p2U);
+                p2.j[j].qr[i]=creal(p1Q-p2Q); p2.j[j].qi[i]=cimag(p1Q-p2Q);
+                p2.j[j].ur[i]=creal(p1U-p2U); p2.j[j].ui[i]=cimag(p1U-p2U);
+                }
+              }
+            }
+          Z(calc_map2alm_spin) (cth.b,gen,job,&p1.b,&p2.b,&done);
+          }
+        }
+      break;
+      }
+    }
+  }
+
+#undef VZERO
diff --git a/libsharp/sharp_core_inc3.c b/libsharp/sharp_core_inc3.c
new file mode 100644
index 0000000..9fecec4
--- /dev/null
+++ b/libsharp/sharp_core_inc3.c
@@ -0,0 +1,691 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_core_inc3.c
+ *  Type-dependent code for the computational core
+ *
+ *  Copyright (C) 2012 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
+  Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
+  const ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
+  int l, int lmax, int njobs)
+  { /* Steps the lam recurrence from l to lmax, folding alm[njobs*l+j] into p1[j]/p2[j]. */
+  while (l<lmax-2) /* unrolled part: consumes l, l+1, l+2 and l+3 per pass */
+    {
+    Tb lam_3, lam_4;
+    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
+    for (int i=0; i<nvec; ++i) /* lam_3 = cth*lam_2*r0 - lam_1*r1 */
+      lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
+    for (int i=0; i<nvec; ++i) /* same step, one l further */
+      lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
+    r0=vload(rf[l+2].f[0]);r1=vload(rf[l+2].f[1]);
+    for (int i=0; i<nvec; ++i) /* lam_1 is reused to hold the value for l+3 */
+      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv ar2=vload(creal(alm[njobs*l+j])), /* coefficients at l and l+2 feed p1 */
+         ai2=vload(cimag(alm[njobs*l+j])),
+         ar4=vload(creal(alm[njobs*(l+2)+j])),
+         ai4=vload(cimag(alm[njobs*(l+2)+j]));
+      for (int i=0; i<nvec; ++i)
+        { /* vfmaaeq is defined elsewhere; presumably a += b*c + d*e - confirm */
+        vfmaaeq(p1[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4);
+        vfmaaeq(p1[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4);
+        }
+      Tv ar3=vload(creal(alm[njobs*(l+1)+j])), /* coefficients at l+1 and l+3 feed p2 */
+         ai3=vload(cimag(alm[njobs*(l+1)+j])),
+         ar1=vload(creal(alm[njobs*(l+3)+j])),
+         ai1=vload(cimag(alm[njobs*(l+3)+j]));
+      for (int i=0; i<nvec; ++i)
+        {
+        vfmaaeq(p2[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1);
+        vfmaaeq(p2[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1);
+        }
+      }
+    r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]);
+    for (int i=0; i<nvec; ++i) /* lam_2 becomes the value for l+4, i.e. the next pass */
+      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
+    l+=4;
+    }
+  while (l<lmax) /* remainder: two l values per pass */
+    {
+    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
+    for (int i=0; i<nvec; ++i)
+      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv ar=vload(creal(alm[njobs*l+j])),
+         ai=vload(cimag(alm[njobs*l+j]));
+      for (int i=0; i<nvec; ++i)
+        { /* lam_2 pairs with alm at l and goes to p1 */
+        vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
+        vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
+        }
+      ar=vload(creal(alm[njobs*(l+1)+j]));
+      ai=vload(cimag(alm[njobs*(l+1)+j]));
+      for (int i=0; i<nvec; ++i)
+        { /* lam_1 pairs with alm at l+1 and goes to p2 */
+        vfmaeq(p2[j].r.v[i],lam_1.v[i],ar);
+        vfmaeq(p2[j].i.v[i],lam_1.v[i],ai);
+        }
+      }
+    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
+    for (int i=0; i<nvec; ++i)
+      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+    l+=2;
+    }
+  if (l==lmax) /* one l value left over: only p1 receives a term */
+    {
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
+      for (int i=0; i<nvec; ++i)
+        {
+        vfmaeq(p1[j].r.v[i],lam_2.v[i],ar);
+        vfmaeq(p1[j].i.v[i],lam_2.v[i],ai);
+        }
+      }
+    }
+  }
+
+static void Y(map2alm_kernel) (const Tb cth, const Y(Tbri) * restrict p1,
+  const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
+  const ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax,
+  int njobs)
+  { /* Steps the lam recurrence from l to lmax, reducing lam*p1 / lam*p2 into alm[l*njobs+j]. */
+  while (l<lmax) /* two l values per pass */
+    {
+    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
+    for (int i=0; i<nvec; ++i) /* lam_1 = cth*lam_2*r0 - lam_1*r1 */
+      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv tr1=vzero, ti1=vzero, tr2=vzero, ti2=vzero;
+      for (int i=0; i<nvec; ++i)
+        { /* partial sums for l: lam_2 times p1 */
+        vfmaeq(tr1,lam_2.v[i],p1[j].r.v[i]);
+        vfmaeq(ti1,lam_2.v[i],p1[j].i.v[i]);
+        }
+      for (int i=0; i<nvec; ++i)
+        { /* partial sums for l+1: lam_1 times p2 */
+        vfmaeq(tr2,lam_1.v[i],p2[j].r.v[i]);
+        vfmaeq(ti2,lam_1.v[i],p2[j].i.v[i]);
+        }
+      vhsum_cmplx2(tr1,ti1,tr2,ti2,&alm[l*njobs+j],&alm[(l+1)*njobs+j]); /* defined elsewhere; presumably adds the lane sums to both targets - confirm */
+      }
+    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
+    for (int i=0; i<nvec; ++i)
+      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+    l+=2;
+    }
+  if (l==lmax) /* one l value left over: only p1 contributes */
+    {
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv tre=vzero, tim=vzero;
+      for (int i=0; i<nvec; ++i)
+        {
+        vfmaeq(tre,lam_2.v[i],p1[j].r.v[i]);
+        vfmaeq(tim,lam_2.v[i],p1[j].i.v[i]);
+        }
+      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
+      }
+    }
+  }
+
+static void Y(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen,
+  sharp_job *job, Y(Tbri) * restrict p1, Y(Tbri) * restrict p2, int njobs,
+  int *done)
+  { /* Spin-0 alm2map for one block of cth/sth values: adds job->almtmp terms to p1/p2. */
+  int l,lmax=gen->lmax;
+  Tb lam_1,lam_2,scale;
+  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); /* defined elsewhere; expected to set l, lam_1, lam_2, scale */
+  job->opcnt += (l-gen->m) * 4*VLEN*nvec; /* operation-count bookkeeping */
+  if (l>lmax) { *done=1; return; } /* start index already beyond lmax: nothing to add */
+  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
+
+  Tb corfac;
+  Y(getCorfac)(scale,&corfac,gen->cf);
+  const ylmgen_dbl2 * restrict rf = gen->rf;
+  const dcmplx * restrict alm=job->almtmp;
+  int full_ieee = Y(TballGt)(scale,minscale);
+  while (!full_ieee) /* scaled phase: lam values are multiplied by corfac before each use */
+    {
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
+      for (int i=0; i<nvec; ++i)
+        {
+        Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
+        vfmaeq(p1[j].r.v[i],tmp,ar);
+        vfmaeq(p1[j].i.v[i],tmp,ai);
+        }
+      }
+    if (++l>lmax) break;
+    Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]); /* l was just incremented, hence rf[l-1] */
+    for (int i=0; i<nvec; ++i)
+      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
+      for (int i=0; i<nvec; ++i)
+        {
+        Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
+        vfmaeq(p2[j].r.v[i],tmp,ar);
+        vfmaeq(p2[j].i.v[i],tmp,ai);
+        }
+      }
+    if (++l>lmax) break;
+    r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
+    for (int i=0; i<nvec; ++i)
+      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+    if (Y(rescale)(&lam_1,&lam_2,&scale)) /* scale changed: refresh corfac and the exit test */
+      {
+      Y(getCorfac)(scale,&corfac,gen->cf);
+      full_ieee = Y(TballGt)(scale,minscale);
+      }
+    }
+  if (l>lmax) { *done=1; return; } /* lmax was reached inside the scaled phase */
+
+  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); /* apply corfac once, then run the unscaled kernel */
+  Y(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, njobs);
+  }
+
+static void Y(calc_map2alm) (const Tb cth, const Tb sth,
+  const Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
+  const Y(Tbri) * restrict p2, int njobs, int *done)
+  { /* Spin-0 map2alm for one block of cth/sth values: accumulates into job->almtmp. */
+  int lmax=gen->lmax;
+  Tb lam_1,lam_2,scale;
+  int l=gen->m;
+  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); /* defined elsewhere; updates l, lam_1, lam_2, scale */
+  job->opcnt += (l-gen->m) * 4*VLEN*nvec; /* operation-count bookkeeping */
+  if (l>lmax) { *done=1; return; } /* start index already beyond lmax: no contribution */
+  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
+
+  const ylmgen_dbl2 * restrict rf = gen->rf;
+  Tb corfac;
+  Y(getCorfac)(scale,&corfac,gen->cf);
+  dcmplx * restrict alm=job->almtmp;
+  int full_ieee = Y(TballGt)(scale,minscale);
+  while (!full_ieee) /* scaled phase: lam values are multiplied by corfac before each use */
+    {
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv tre=vzero, tim=vzero;
+      for (int i=0; i<nvec; ++i)
+        {
+        Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
+        vfmaeq(tre,tmp,p1[j].r.v[i]);
+        vfmaeq(tim,tmp,p1[j].i.v[i]);
+        }
+      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
+      }
+    if (++l>lmax) { *done=1; return; }
+    Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]); /* l was just incremented, hence rf[l-1] */
+    for (int i=0; i<nvec; ++i)
+      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
+    for (int j=0; j<njobs; ++j)
+      {
+      Tv tre=vzero, tim=vzero;
+      for (int i=0; i<nvec; ++i)
+        {
+        Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
+        vfmaeq(tre,tmp,p2[j].r.v[i]);
+        vfmaeq(tim,tmp,p2[j].i.v[i]);
+        }
+      alm[l*njobs+j]+=vhsum_cmplx(tre,tim);
+      }
+    if (++l>lmax) { *done=1; return; }
+    r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
+    for (int i=0; i<nvec; ++i)
+      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
+    if (Y(rescale)(&lam_1,&lam_2,&scale)) /* scale changed: refresh corfac and the exit test */
+      {
+      Y(getCorfac)(scale,&corfac,gen->cf);
+      full_ieee = Y(TballGt)(scale,minscale);
+      }
+    }
+
+  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); /* apply corfac once, then run the unscaled kernel */
+  Y(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, njobs);
+  }
+
+static inline void Y(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
+  const Tb rxp, const Tb rxm, const dcmplx * restrict alm, int njobs)
+  { /* One spin alm2map step: alm[2*j] and alm[2*j+1] are spread onto px[j] and py[j]. */
+  for (int j=0; j<njobs; ++j)
+    {
+    Tv agr=vload(creal(alm[2*j])), agi=vload(cimag(alm[2*j])),
+       acr=vload(creal(alm[2*j+1])), aci=vload(cimag(alm[2*j+1]));
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv lw=vadd(rxp.v[i],rxm.v[i]); /* lw = rxp + rxm, used for px */
+      vfmaeq(px[j].qr.v[i],agr,lw);
+      vfmaeq(px[j].qi.v[i],agi,lw);
+      vfmaeq(px[j].ur.v[i],acr,lw);
+      vfmaeq(px[j].ui.v[i],aci,lw);
+      }
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv lx=vsub(rxm.v[i],rxp.v[i]); /* lx = rxm - rxp, used for py with q/u and re/im swapped */
+      vfmseq(py[j].qr.v[i],aci,lx);
+      vfmaeq(py[j].qi.v[i],acr,lx);
+      vfmaeq(py[j].ur.v[i],agi,lx);
+      vfmseq(py[j].ui.v[i],agr,lx);
+      }
+    }
+  }
+
+static inline void Y(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
+  const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
+  const dcmplx * restrict alm1, const dcmplx * restrict alm2, int njobs)
+  { /* Two fused steps: alm1 is combined with r2p/r2m and alm2 with r1p/r1m. */
+  for (int j=0; j<njobs; ++j)
+    {
+    Tv agr1=vload(creal(alm1[2*j])), agi1=vload(cimag(alm1[2*j])),
+       acr1=vload(creal(alm1[2*j+1])), aci1=vload(cimag(alm1[2*j+1]));
+    Tv agr2=vload(creal(alm2[2*j])), agi2=vload(cimag(alm2[2*j])),
+       acr2=vload(creal(alm2[2*j+1])), aci2=vload(cimag(alm2[2*j+1]));
+    for (int i=0; i<nvec; ++i)
+      { /* p1 gets the sum term of alm1 and the difference term of alm2 */
+      Tv lw1=vadd(r2p.v[i],r2m.v[i]);
+      Tv lx2=vsub(r1m.v[i],r1p.v[i]);
+      vfmaseq(p1[j].qr.v[i],agr1,lw1,aci2,lx2);
+      vfmaaeq(p1[j].qi.v[i],agi1,lw1,acr2,lx2);
+      vfmaaeq(p1[j].ur.v[i],acr1,lw1,agi2,lx2);
+      vfmaseq(p1[j].ui.v[i],aci1,lw1,agr2,lx2);
+      }
+    for (int i=0; i<nvec; ++i)
+      { /* p2 gets the sum term of alm2 and the difference term of alm1 */
+      Tv lx1=vsub(r2m.v[i],r2p.v[i]);
+      Tv lw2=vadd(r1p.v[i],r1m.v[i]);
+      vfmaseq(p2[j].qr.v[i],agr2,lw2,aci1,lx1);
+      vfmaaeq(p2[j].qi.v[i],agi2,lw2,acr1,lx1);
+      vfmaaeq(p2[j].ur.v[i],acr2,lw2,agi1,lx1);
+      vfmaseq(p2[j].ui.v[i],aci2,lw2,agr1,lx1);
+      }
+    }
+  }
+
+static inline void Y(saddstep2) (const Y(Tbqu) * restrict px,
+  const Y(Tbqu) * restrict py, const Tb * restrict rxp,
+  const Tb * restrict rxm, dcmplx * restrict alm, int njobs)
+  { /* One spin map2alm step: reduces px[j]/py[j] into alm[2*j] and alm[2*j+1]. */
+  for (int j=0; j<njobs; ++j)
+    {
+    Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero;
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv lw=vadd(rxp->v[i],rxm->v[i]); /* lw = rxp + rxm, weights px */
+      vfmaeq(agr,px[j].qr.v[i],lw);
+      vfmaeq(agi,px[j].qi.v[i],lw);
+      vfmaeq(acr,px[j].ur.v[i],lw);
+      vfmaeq(aci,px[j].ui.v[i],lw);
+      }
+    for (int i=0; i<nvec; ++i)
+      {
+      Tv lx=vsub(rxm->v[i],rxp->v[i]); /* lx = rxm - rxp, weights py with q/u and re/im swapped */
+      vfmseq(agr,py[j].ui.v[i],lx);
+      vfmaeq(agi,py[j].ur.v[i],lx);
+      vfmaeq(acr,py[j].qi.v[i],lx);
+      vfmseq(aci,py[j].qr.v[i],lx);
+      }
+    vhsum_cmplx2(agr,agi,acr,aci,&alm[2*j],&alm[2*j+1]); /* defined elsewhere; presumably adds the lane sums to both targets - confirm */
+    }
+  }
+
+static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
+  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
+  const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
+  int lmax, int njobs)
+  { /* Steps the rec*p/rec*m recurrences from l to lmax, adding alm[2*njobs*l ...] terms to p1/p2. */
+  while (l<lmax) /* two l values per pass */
+    {
+    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
+       fx2=vload(fx[l+1].f[2]);
+    for (int i=0; i<nvec; ++i)
+      { /* rec1 = (cth -/+ fx1)*fx0*rec2 - fx2*rec1, minus for the p set, plus for the m set */
+      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
+                        vmul(fx2,rec1p.v[i]));
+      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
+                        vmul(fx2,rec1m.v[i]));
+      }
+    Y(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
+      &alm[2*njobs*(l+1)], njobs);
+    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
+    fx2=vload(fx[l+2].f[2]);
+    for (int i=0; i<nvec; ++i)
+      { /* same step with the roles of rec1 and rec2 exchanged */
+      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
+                        vmul(fx2,rec2p.v[i]));
+      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
+                        vmul(fx2,rec2m.v[i]));
+      }
+    l+=2;
+    }
+  if (l==lmax) /* one l value left over */
+    Y(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l], njobs);
+  }
+
+static void Y(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
+  const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
+  const ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax,
+  int njobs)
+  { /* Steps the rec*p/rec*m recurrences from l to lmax, reducing p1/p2 into alm[2*njobs*l ...]. */
+  while (l<lmax) /* two l values per pass */
+    {
+    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
+       fx2=vload(fx[l+1].f[2]);
+    for (int i=0; i<nvec; ++i)
+      { /* rec1 = (cth -/+ fx1)*fx0*rec2 - fx2*rec1, minus for the p set, plus for the m set */
+      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
+                        vmul(fx2,rec1p.v[i]));
+      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
+                        vmul(fx2,rec1m.v[i]));
+      }
+    Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l],njobs); /* l uses rec2 with (p1,p2) */
+    Y(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*njobs*(l+1)],njobs); /* l+1 uses rec1 with (p2,p1) */
+    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
+    fx2=vload(fx[l+2].f[2]);
+    for (int i=0; i<nvec; ++i)
+      { /* same step with the roles of rec1 and rec2 exchanged */
+      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
+                        vmul(fx2,rec2p.v[i]));
+      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
+                        vmul(fx2,rec2m.v[i]));
+      }
+    l+=2;
+    }
+  if (l==lmax) /* one l value left over */
+    Y(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l], njobs);
+  }
+
+static void Y(calc_alm2map_spin) (const Tb cth, const Ylmgen_C *gen,
+  sharp_job *job, Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2, int njobs,
+  int *done)
+  { /* Spin alm2map for one block of cth values: adds job->almtmp terms to p1/p2. */
+  int l, lmax=gen->lmax;
+  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
+  Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); /* defined elsewhere; expected to set l, rec* and scale* */
+  job->opcnt += (l-gen->m) * 10*VLEN*nvec; /* operation-count bookkeeping */
+  if (l>lmax)
+   { *done=1; return; }
+  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
+
+  const ylmgen_dbl3 * restrict fx = gen->fx;
+  Tb corfacp,corfacm;
+  Y(getCorfac)(scalep,&corfacp,gen->cf);
+  Y(getCorfac)(scalem,&corfacm,gen->cf);
+  const dcmplx * restrict alm=job->almtmp;
+  int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+  while (!full_ieee) /* scaled phase: rec values are multiplied by corfacp/corfacm before each use */
+    {
+    Y(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
+      &alm[2*njobs*l],njobs);
+    if (++l>lmax) break;
+    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
+    Y(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
+      &alm[2*njobs*l], njobs);
+    if (++l>lmax) break;
+    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
+    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) /* bitwise |: both rescale calls always run */
+      {
+      Y(getCorfac)(scalep,&corfacp,gen->cf);
+      Y(getCorfac)(scalem,&corfacm,gen->cf);
+      full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+      }
+    }
+
+  if (l>lmax) /* lmax was reached inside the scaled phase */
+    { *done=1; return; }
+
+  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); /* apply the factors once, then run the unscaled kernel */
+  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
+  Y(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
+    lmax, njobs);
+  }
+
+static void Y(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen,
+  sharp_job *job, const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2,
+  int njobs, int *done)
+  { /* Spin map2alm for one block of cth values: accumulates into job->almtmp. */
+  int l, lmax=gen->lmax;
+  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
+  Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); /* defined elsewhere; expected to set l, rec* and scale* */
+  job->opcnt += (l-gen->m) * 10*VLEN*nvec; /* operation-count bookkeeping */
+  if (l>lmax) { *done=1; return; }
+  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
+
+  const ylmgen_dbl3 * restrict fx = gen->fx;
+  Tb corfacp,corfacm;
+  Y(getCorfac)(scalep,&corfacp,gen->cf);
+  Y(getCorfac)(scalem,&corfacm,gen->cf);
+  dcmplx * restrict alm=job->almtmp;
+  int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+  while (!full_ieee) /* scaled phase: rec values are multiplied by corfacp/corfacm before each use */
+    {
+    Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
+    Y(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l], njobs);
+    if (++l>lmax) { *done=1; return; }
+    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
+    t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm);
+    Y(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l], njobs);
+    if (++l>lmax) { *done=1; return; }
+    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
+    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) /* bitwise |: both rescale calls always run */
+      {
+      Y(getCorfac)(scalep,&corfacp,gen->cf);
+      Y(getCorfac)(scalem,&corfacm,gen->cf);
+      full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale);
+      }
+    }
+
+  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); /* apply the factors once, then run the unscaled kernel */
+  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
+  Y(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax,njobs);
+  }
+
+#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) /* zero-fills an object; only used by inner_loop below */
+
+static void Y(inner_loop) (sharp_job *job, const int *ispair,
+  const double *cth_, const double *sth_, int llim, int ulim, Ylmgen_C *gen,
+  int mi, const int *idx, int njobs)
+  { /* For m=job->ainfo->mval[mi], processes idx[0..ulim-llim) in chunks of nval entries. */
+  const int nval=nvec*VLEN; /* number of cth values handled per kernel call */
+  const int m = job->ainfo->mval[mi];
+  Ylmgen_prepare (gen, m);
+
+  switch (job->type)
+    {
+    case ALM2MAP:
+      {
+      if (job->spin==0)
+        {
+        int done=0;
+        for (int ith=0; ith<ulim-llim; ith+=nval)
+          {
+          Y(Tburi) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
+          if (!done) /* once done is set, later chunks keep their zeroed p1/p2 */
+            {
+            Y(Tbu) cth, sth;
+
+            for (int i=0; i<nval; ++i)
+              {
+              int itot=i+ith;
+              if (itot>=ulim-llim) itot=ulim-llim-1; /* pad a partial chunk with the last entry */
+              itot=idx[itot];
+              cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
+              }
+            Y(calc_alm2map) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
+            }
+
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot<ulim-llim) /* padded lanes are not written back */
+              {
+              itot=idx[itot];
+              for (int j=0; j<njobs; ++j)
+                {
+                int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi));
+                complex double r1 = p1[j].s.r[i] + p1[j].s.i[i]*_Complex_I,
+                               r2 = p2[j].s.r[i] + p2[j].s.i[i]*_Complex_I;
+                job->phase[phas_idx] = r1+r2;
+                if (ispair[itot]) /* second slot is only written when ispair[] is set */
+                  job->phase[phas_idx+1] = r1-r2;
+                }
+              }
+            }
+          }
+        }
+      else
+        {
+        int done=0;
+        for (int ith=0; ith<ulim-llim; ith+=nval)
+          {
+          Y(Tbuqu) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
+          if (!done) /* once done is set, later chunks keep their zeroed p1/p2 */
+            {
+            Y(Tbu) cth;
+
+            for (int i=0; i<nval; ++i)
+              {
+              int itot=i+ith;
+              if (itot>=ulim-llim) itot=ulim-llim-1; /* pad a partial chunk with the last entry */
+              itot=idx[itot];
+              cth.s[i]=cth_[itot];
+              }
+            Y(calc_alm2map_spin) (cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
+            }
+
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot<ulim-llim) /* padded lanes are not written back */
+              {
+              itot=idx[itot];
+              for (int j=0; j<njobs; ++j)
+                {
+                int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi)); /* four phase slots per job here */
+                complex double q1 = p1[j].s.qr[i] + p1[j].s.qi[i]*_Complex_I,
+                               q2 = p2[j].s.qr[i] + p2[j].s.qi[i]*_Complex_I,
+                               u1 = p1[j].s.ur[i] + p1[j].s.ui[i]*_Complex_I,
+                               u2 = p2[j].s.ur[i] + p2[j].s.ui[i]*_Complex_I;
+                job->phase[phas_idx] = q1+q2;
+                job->phase[phas_idx+2] = u1+u2;
+                if (ispair[itot])
+                  {
+                  dcmplx *phQ = &(job->phase[phas_idx+1]),
+                         *phU = &(job->phase[phas_idx+3]);
+                  *phQ = q1-q2;
+                  *phU = u1-u2;
+                  if ((gen->mhi-gen->m+gen->s)&1) /* odd mhi-m+s: flip the sign of the paired values */
+                    { *phQ=-(*phQ); *phU=-(*phU); }
+                  }
+                }
+              }
+            }
+          }
+        }
+      break;
+      }
+    case ALM2MAP_DERIV1: /* no work is done for this job type here */
+      break;
+    case MAP2ALM:
+      {
+      if (job->spin==0)
+        {
+        int done=0;
+        for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval) /* stops as soon as a kernel call sets done */
+          {
+          Y(Tburi) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
+          Y(Tbu) cth, sth;
+
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot>=ulim-llim) itot=ulim-llim-1; /* pad a partial chunk with the last entry */
+            itot=idx[itot];
+            cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
+            if (i+ith<ulim-llim) /* padded lanes keep zero input */
+              {
+              for (int j=0; j<njobs; ++j)
+                {
+                int phas_idx = 2*(j+njobs*(itot*job->ainfo->nm+mi));
+                dcmplx ph1=job->phase[phas_idx];
+                dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
+                p1[j].s.r[i]=creal(ph1+ph2); p1[j].s.i[i]=cimag(ph1+ph2);
+                p2[j].s.r[i]=creal(ph1-ph2); p2[j].s.i[i]=cimag(ph1-ph2);
+                }
+              }
+            }
+          Y(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
+          }
+        }
+      else
+        {
+        int done=0;
+        for (int ith=0; (ith<ulim-llim)&&(!done); ith+=nval) /* stops as soon as a kernel call sets done */
+          {
+          Y(Tbuqu) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
+          Y(Tbu) cth;
+
+          for (int i=0; i<nval; ++i)
+            {
+            int itot=i+ith;
+            if (itot>=ulim-llim) itot=ulim-llim-1; /* pad a partial chunk with the last entry */
+            itot=idx[itot];
+            cth.s[i]=cth_[itot];
+            if (i+ith<ulim-llim) /* padded lanes keep zero input */
+              {
+              for (int j=0; j<njobs; ++j)
+                {
+                int phas_idx = 4*(j+njobs*(itot*job->ainfo->nm+mi));
+                dcmplx p1Q=job->phase[phas_idx],
+                       p1U=job->phase[phas_idx+2],
+                       p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
+                       p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
+                if ((gen->mhi-gen->m+gen->s)&1) /* odd mhi-m+s: flip the sign of the paired values */
+                  { p2Q=-p2Q; p2U=-p2U; }
+                p1[j].s.qr[i]=creal(p1Q+p2Q); p1[j].s.qi[i]=cimag(p1Q+p2Q);
+                p1[j].s.ur[i]=creal(p1U+p2U); p1[j].s.ui[i]=cimag(p1U+p2U);
+                p2[j].s.qr[i]=creal(p1Q-p2Q); p2[j].s.qi[i]=cimag(p1Q-p2Q);
+                p2[j].s.ur[i]=creal(p1U-p2U); p2[j].s.ui[i]=cimag(p1U-p2U);
+                }
+              }
+            }
+          Y(calc_map2alm_spin) (cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done);
+          }
+        }
+      break;
+      }
+    }
+  }
+
+#undef VZERO
diff --git a/libsharp/sharp_geomhelpers.c b/libsharp/sharp_geomhelpers.c
new file mode 100644
index 0000000..c095484
--- /dev/null
+++ b/libsharp/sharp_geomhelpers.c
@@ -0,0 +1,222 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_geomhelpers.c
+ *  Spherical transform library
+ *
+ *  Copyright (C) 2006-2011 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#include <math.h>
+#include "sharp_geomhelpers.h"
+#include "c_utils.h"
+
+void sharp_make_healpix_geom_info (int nside, int stride,
+  sharp_geom_info **geom_info)
+  { /* unweighted variant: hands 2*nside weights, all set to 1, to the weighted builder */
+  const int nweights=2*nside; double *ones=RALLOC(double,nweights);
+  SET_ARRAY(ones,0,nweights,1);
+  sharp_make_weighted_healpix_geom_info (nside, stride, ones, geom_info);
+  DEALLOC(ones);
+  }
+
+void sharp_make_weighted_healpix_geom_info (int nside, int stride,
+  const double *weight, sharp_geom_info **geom_info)
+  {
+  /* Fills per-ring tables for the 4*nside-1 rings of the grid and forwards
+     them to sharp_make_geom_info(); weight[] is indexed by northern ring. */
+  const double pi=3.141592653589793238462643383279502884197;
+  const ptrdiff_t npix=(ptrdiff_t)nside*nside*12;
+  const ptrdiff_t ncap=2*(ptrdiff_t)nside*(nside-1);
+  const int nrings=4*nside-1;
+
+  double *colat=RALLOC(double,nrings);
+  double *ringweight=RALLOC(double,nrings);
+  int *npixring=RALLOC(int,nrings);
+  double *firstphi=RALLOC(double,nrings);
+  ptrdiff_t *offset=RALLOC(ptrdiff_t,nrings);
+  int *ringstride=RALLOC(int,nrings);
+  for (int m=0; m<nrings; ++m)
+    {
+    const int ring=m+1;
+    const int south=(ring>2*nside);
+    const ptrdiff_t nr = south ? 4*nside-ring : ring; /* ring number mirrored to the north */
+    ringstride[m] = stride;
+    if (nr >= nside) /* rings holding 4*nside pixels */
+      {
+      const double fact1 = (8.*nside)/npix;
+      const double costheta = (2*nside-nr)*fact1;
+      colat[m] = acos(costheta);
+      npixring[m] = 4*nside;
+      firstphi[m] = ((nr-nside) & 1) ? 0 : pi/npixring[m]; /* 0 when nr-nside is odd */
+      offset[m] = (ncap + (nr-nside)*npixring[m])*stride;
+      }
+    else /* rings holding 4*nr pixels */
+      {
+      colat[m] = 2*asin(nr/(sqrt(6.)*nside));
+      npixring[m] = 4*nr;
+      firstphi[m] = pi/npixring[m];
+      offset[m] = 2*nr*(nr-1)*stride;
+      }
+    if (south) /* southern hemisphere: mirror colatitude and offset */
+      {
+      colat[m] = pi-colat[m];
+      offset[m] = (npix - npixring[m])*stride - offset[m];
+      }
+    ringweight[m]=4.*pi/npix*weight[nr-1];
+    }
+
+  sharp_make_geom_info (nrings, npixring, offset, ringstride, firstphi, colat,
+    ringweight, geom_info);
+
+  DEALLOC(colat);
+  DEALLOC(ringweight);
+  DEALLOC(npixring);
+  DEALLOC(firstphi);
+  DEALLOC(offset);
+  DEALLOC(ringstride);
+  }
+
+static void gauleg (double x1, double x2, double *x, double *w, int n)
+  {
+  /* Newton iteration on the Legendre recurrence: stores n abscissas in x and
+     n weights in w for the interval [x1,x2], filling both ends inwards. */
+  const double pi = 3.141592653589793238462643383279502884197;
+  const double eps = 3.0E-14;
+  const double mid = 0.5*(x2+x1), halflen = 0.5*(x2-x1);
+  const int nhalf = (n+1)/2;
+
+  for (int k=0; k<nhalf; ++k)
+    {
+    double z = cos(pi*((k+1)-0.25)/(n+0.5));
+    double deriv;
+    int polish=0; /* once converged, exactly one further step is taken */
+    for (;;)
+      {
+      const double zold = z;
+      double pcur = 1.0, pprev = 0.0;
+      for (int j=1; j<=n; ++j)
+        {
+        const double pprev2 = pprev;
+        pprev = pcur;
+        pcur = ((2*j-1)*z*pprev-(j-1)*pprev2)/j;
+        }
+      deriv = n*(z*pcur-pprev)/(z*z-1);
+      z = zold - pcur/deriv;
+      if (polish) break;
+      polish = (fabs(z-zold) <= eps);
+      }
+    x[k] = mid - halflen*z;
+    x[n-1-k] = mid + halflen*z;
+    w[k] = w[n-1-k] = 2*halflen/((1-z*z)*deriv*deriv);
+    }
+  }
+
+static void makeweights (int bw, double *weights)
+  {
+  /* weights[j] = 2/bw * sin(a_j) * sum_{k<bw} sin((2k+1)*a_j)/(2k+1)
+     with a_j = (2j+1)*pi/(4*bw), for j = 0 .. 2*bw-1 */
+  const double pi = 3.141592653589793238462643383279502884197;
+  const double quarter_step = pi/(4*bw);
+  for (int ring=0; ring<2*bw; ++ring)
+    {
+    double acc = 0;
+    for (int k=0; k<bw; ++k)
+      acc += 1./(2*k+1) * sin((2*ring+1)*(2*k+1)*quarter_step);
+    acc *= sin((2*ring+1)*quarter_step);
+    weights[ring] = acc*(2./bw);
+    }
+  }
+
+void sharp_make_gauss_geom_info (int nrings, int nphi, int stride_lon,
+  int stride_lat, sharp_geom_info **geom_info)
+  {
+  /* Gauss-Legendre grid: nrings rings of nphi pixels each, all starting at
+     azimuth 0. NOTE(review): gauleg() yields ascending abscissas, so the
+     colatitudes below run from near pi down to near 0 - confirm intended. */
+  const double pi=3.141592653589793238462643383279502884197;
+  double *colat=RALLOC(double,nrings);
+  double *wgt=RALLOC(double,nrings);
+  int *npixring=RALLOC(int,nrings);
+  double *firstphi=RALLOC(double,nrings);
+  ptrdiff_t *offset=RALLOC(ptrdiff_t,nrings);
+  int *lonstride=RALLOC(int,nrings);
+
+  gauleg(-1,1,colat,wgt,nrings); /* colat holds the abscissas until acos() below */
+  for (int ring=0; ring<nrings; ++ring)
+    {
+    colat[ring] = acos(colat[ring]);
+    wgt[ring]*=2*pi/nphi;
+    npixring[ring]=nphi;
+    firstphi[ring]=0;
+    lonstride[ring]=stride_lon;
+    offset[ring]=(ptrdiff_t)ring*stride_lat;
+    }
+
+  sharp_make_geom_info (nrings, npixring, offset, lonstride, firstphi, colat,
+    wgt, geom_info);
+  DEALLOC(colat);
+  DEALLOC(wgt);
+  DEALLOC(npixring);
+  DEALLOC(firstphi);
+  DEALLOC(offset);
+  DEALLOC(lonstride);
+  }
+
+void sharp_make_ecp_geom_info (int nrings, int nphi, double phi0,
+  int stride_lon, int stride_lat, sharp_geom_info **geom_info)
+  {
+  /* Equidistant grid: ring m sits at colatitude (m+0.5)*pi/nrings, and every
+     ring has nphi pixels with the first one at azimuth phi0. */
+  const double pi=3.141592653589793238462643383279502884197;
+  double *colat=RALLOC(double,nrings);
+  double *wgt=RALLOC(double,nrings);
+  int *npixring=RALLOC(int,nrings);
+  double *firstphi=RALLOC(double,nrings);
+  ptrdiff_t *offset=RALLOC(ptrdiff_t,nrings);
+  int *lonstride=RALLOC(int,nrings);
+  UTIL_ASSERT((nrings&1)==0,
+    "Even number of rings needed for equidistant grid!");
+  makeweights(nrings/2,wgt); /* writes 2*(nrings/2) == nrings entries */
+  for (int ring=0; ring<nrings; ++ring)
+    {
+    colat[ring] = (ring+0.5)*pi/nrings;
+    wgt[ring]*=2*pi/nphi;
+    npixring[ring]=nphi;
+    firstphi[ring]=phi0;
+    lonstride[ring]=stride_lon;
+    offset[ring]=(ptrdiff_t)ring*stride_lat;
+    }
+
+  sharp_make_geom_info (nrings, npixring, offset, lonstride, firstphi, colat,
+    wgt, geom_info);
+
+  DEALLOC(colat);
+  DEALLOC(wgt);
+  DEALLOC(npixring);
+  DEALLOC(firstphi);
+  DEALLOC(offset);
+  DEALLOC(lonstride);
+  }
diff --git a/libsharp/sharp_geomhelpers.h b/libsharp/sharp_geomhelpers.h
new file mode 100644
index 0000000..8f20f74
--- /dev/null
+++ b/libsharp/sharp_geomhelpers.h
@@ -0,0 +1,82 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_geomhelpers.h
+ *  SHARP helper functions for the creation of grid geometries
+ *
+ *  Copyright (C) 2006-2011 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#ifndef PLANCK_SHARP_GEOMHELPERS_H
+#define PLANCK_SHARP_GEOMHELPERS_H
+
+#include "sharp.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! Creates a geometry information describing a HEALPix map with an
+    Nside parameter \a nside.
+    \ingroup geominfogroup */
+void sharp_make_healpix_geom_info (int nside, int stride,
+  sharp_geom_info **geom_info);
+
+/*! Creates a geometry information describing a HEALPix map with an
+    Nside parameter \a nside. \a weight contains the relative ring
+    weights and must have \a 2*nside entries.
+    \ingroup geominfogroup */
+void sharp_make_weighted_healpix_geom_info (int nside, int stride,
+  const double *weight, sharp_geom_info **geom_info);
+
+/*! Creates a geometry information describing a Gaussian map with \a nrings
+    iso-latitude rings and \a nphi pixels per ring. The azimuth of the first
+    pixel in each ring is 0. The index difference between two adjacent pixels
+    in an iso-latitude ring is \a stride_lon, the index difference between the
+    two start pixels in consecutive iso-latitude rings is \a stride_lat.
+    \ingroup geominfogroup */
+void sharp_make_gauss_geom_info (int nrings, int nphi, int stride_lon,
+  int stride_lat, sharp_geom_info **geom_info);
+
+/*! Creates a geometry information describing an ECP map with \a nrings
+    iso-latitude rings and \a nphi pixels per ring. The azimuth of the first
+    pixel in each ring is \a phi0 (in radians). The index difference between
+    two adjacent pixels in an iso-latitude ring is \a stride_lon, the index
+    difference between the two start pixels in consecutive iso-latitude rings
+    is \a stride_lat.
+    \note The spacing of pixel centers is equidistant in colatitude and
+      longitude.
+    \note \a nrings must be an even number.
+    \note The sphere is pixelized in a way that the colatitude of the first ring
+      is \a 0.5*(pi/nrings). There are no pixel centers at the poles.
+    \ingroup geominfogroup */
+void sharp_make_ecp_geom_info (int nrings, int nphi, double phi0,
+  int stride_lon, int stride_lat, sharp_geom_info **geom_info);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/libsharp/sharp_inchelper1.inc.c b/libsharp/sharp_inchelper1.inc.c
new file mode 100644
index 0000000..74de2ee
--- /dev/null
+++ b/libsharp/sharp_inchelper1.inc.c
@@ -0,0 +1,57 @@
+#define Tb CONCAT2(Tb,nvec) /* include helper: presumably suffixes Tb with nvec (CONCAT2 is defined elsewhere -- confirm) */
+#define Y(arg) CONCAT2(arg,nvec) /* same suffixing for arbitrary identifiers */
+#include "sharp_core_inc.c" /* included once per nvec, independent of njobs */
+#if (MAXJOB_SPECIAL<6)
+#include "sharp_core_inc3.c" /* only when fewer than 6 njobs-specific variants are requested */
+#endif /* MAXJOB_SPECIAL<6 */
+/* Each section below includes sharp_core_inc2.c with njobs fixed to one constant and Z(arg) built from arg, nvec and njobs. */
+#if (MAXJOB_SPECIAL>=1)
+#define njobs 1
+#define Z(arg) CONCAT3(arg,nvec,njobs)
+#include "sharp_core_inc2.c"
+#undef Z
+#undef njobs
+#endif /* MAXJOB_SPECIAL>=1 */
+
+#if (MAXJOB_SPECIAL>=2)
+#define njobs 2
+#define Z(arg) CONCAT3(arg,nvec,njobs)
+#include "sharp_core_inc2.c"
+#undef Z
+#undef njobs
+#endif /* MAXJOB_SPECIAL>=2 */
+
+#if (MAXJOB_SPECIAL>=3)
+#define njobs 3
+#define Z(arg) CONCAT3(arg,nvec,njobs)
+#include "sharp_core_inc2.c"
+#undef Z
+#undef njobs
+#endif /* MAXJOB_SPECIAL>=3 */
+
+#if (MAXJOB_SPECIAL>=4)
+#define njobs 4
+#define Z(arg) CONCAT3(arg,nvec,njobs)
+#include "sharp_core_inc2.c"
+#undef Z
+#undef njobs
+#endif /* MAXJOB_SPECIAL>=4 */
+
+#if (MAXJOB_SPECIAL>=5)
+#define njobs 5
+#define Z(arg) CONCAT3(arg,nvec,njobs)
+#include "sharp_core_inc2.c"
+#undef Z
+#undef njobs
+#endif /* MAXJOB_SPECIAL>=5 */
+
+#if (MAXJOB_SPECIAL>=6)
+#define njobs 6
+#define Z(arg) CONCAT3(arg,nvec,njobs)
+#include "sharp_core_inc2.c"
+#undef Z
+#undef njobs
+#endif /* MAXJOB_SPECIAL>=6 */
+/* drop the helper macros so that the including file can define them again */
+#undef Y
+#undef Tb
diff --git a/libsharp/sharp_mpi.c b/libsharp/sharp_mpi.c
new file mode 100644
index 0000000..1827a06
--- /dev/null
+++ b/libsharp/sharp_mpi.c
@@ -0,0 +1,286 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_mpi.c
+ *  Functionality only needed for MPI-parallel transforms
+ *
+ *  Copyright (C) 2012 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#ifdef USE_MPI
+
+#include "sharp_mpi.h"
+
+typedef struct /* bookkeeping for one MPI-distributed transform, filled by sharp_make_mpi_info() */
+  {
+  int ntasks;     /* number of tasks */
+  int mytask;     /* own task number */
+  MPI_Comm comm;  /* communicator to use */
+
+  int *nm;        /* number of m values on every task */
+  int *ofs_m;     /* accumulated nm */
+  int nmtotal;    /* total number of m values (must be mmax+1) */
+  int *mval;      /* array containing all m values of task 0, task 1 etc. */
+  int mmax;       /* result of sharp_get_mmax() over all gathered m values */
+  int nph;        /* complex phase entries per (ring pair, m): 2*nmaps*ntrans */
+
+  int *npair;     /* number of ring pairs on every task */
+  int *ofs_pair;  /* accumulated npair */
+  int npairtotal; /* total number of ring pairs */
+
+  double *theta;  /* theta of first ring of every pair on task 0, task 1 etc. */
+  int *ispair;    /* is this really a pair? */
+
+  int *almcount, *almdisp, *mapcount, *mapdisp; /* for all2all communication; counted in doubles */
+  } sharp_mpi_info;
+
+static void sharp_make_mpi_info (MPI_Comm comm, const sharp_job *job,
+  sharp_mpi_info *minfo)
+  { /* Fills *minfo with the m-value and ring-pair layout of all tasks; collective over comm. Release with sharp_destroy_mpi_info(). */
+  minfo->comm = comm;
+  MPI_Comm_size (comm, &minfo->ntasks);
+  MPI_Comm_rank (comm, &minfo->mytask);
+  /* gather the per-task m counts first, then the m values themselves */
+  minfo->nm=RALLOC(int,minfo->ntasks);
+  MPI_Allgather ((int *)(&job->ainfo->nm),1,MPI_INT,minfo->nm,1,MPI_INT,comm);
+  minfo->ofs_m=RALLOC(int,minfo->ntasks+1);
+  minfo->ofs_m[0]=0;
+  for (int i=1; i<=minfo->ntasks; ++i)
+    minfo->ofs_m[i] = minfo->ofs_m[i-1]+minfo->nm[i-1]; /* prefix sum: ofs_m[i] = first slot of task i */
+  minfo->nmtotal=minfo->ofs_m[minfo->ntasks];
+  minfo->mval=RALLOC(int,minfo->nmtotal);
+  MPI_Allgatherv(job->ainfo->mval, job->ainfo->nm, MPI_INT, minfo->mval,
+    minfo->nm, minfo->ofs_m, MPI_INT, comm);
+
+  minfo->mmax=sharp_get_mmax(minfo->mval,minfo->nmtotal);
+  /* same two-step gather for the ring pairs */
+  minfo->npair=RALLOC(int,minfo->ntasks);
+  MPI_Allgather ((int *)(&job->ginfo->npairs), 1, MPI_INT, minfo->npair, 1,
+    MPI_INT, comm);
+  minfo->ofs_pair=RALLOC(int,minfo->ntasks+1);
+  minfo->ofs_pair[0]=0;
+  for (int i=1; i<=minfo->ntasks; ++i)
+    minfo->ofs_pair[i] = minfo->ofs_pair[i-1]+minfo->npair[i-1];
+  minfo->npairtotal=minfo->ofs_pair[minfo->ntasks];
+  /* exchange, for every pair, theta of its first ring and whether a second ring exists */
+  double *theta_tmp=RALLOC(double,job->ginfo->npairs);
+  int *ispair_tmp=RALLOC(int,job->ginfo->npairs);
+  for (int i=0; i<job->ginfo->npairs; ++i)
+    {
+    theta_tmp[i]=job->ginfo->pair[i].r1.theta;
+    ispair_tmp[i]=job->ginfo->pair[i].r2.nph>0; /* 1 if the second ring has pixels, else 0 */
+    }
+  minfo->theta=RALLOC(double,minfo->npairtotal);
+  minfo->ispair=RALLOC(int,minfo->npairtotal);
+  MPI_Allgatherv(theta_tmp, job->ginfo->npairs, MPI_DOUBLE, minfo->theta,
+    minfo->npair, minfo->ofs_pair, MPI_DOUBLE, comm);
+  MPI_Allgatherv(ispair_tmp, job->ginfo->npairs, MPI_INT, minfo->ispair,
+    minfo->npair, minfo->ofs_pair, MPI_INT, comm);
+  DEALLOC(theta_tmp);
+  DEALLOC(ispair_tmp);
+
+  minfo->nph=2*job->nmaps*job->ntrans;
+  /* counts and displacements, in doubles (2 per complex value), for the MPI_Alltoallv phase exchange */
+  minfo->almcount=RALLOC(int,minfo->ntasks);
+  minfo->almdisp=RALLOC(int,minfo->ntasks+1);
+  minfo->mapcount=RALLOC(int,minfo->ntasks);
+  minfo->mapdisp=RALLOC(int,minfo->ntasks+1);
+  minfo->almdisp[0]=minfo->mapdisp[0]=0;
+  for (int i=0; i<minfo->ntasks; ++i)
+    {
+    minfo->almcount[i] = 2*minfo->nph*minfo->nm[minfo->mytask]*minfo->npair[i]; /* own m values x ring pairs of task i */
+    minfo->almdisp[i+1] = minfo->almdisp[i]+minfo->almcount[i];
+    minfo->mapcount[i] = 2*minfo->nph*minfo->nm[i]*minfo->npair[minfo->mytask]; /* m values of task i x own ring pairs */
+    minfo->mapdisp[i+1] = minfo->mapdisp[i]+minfo->mapcount[i];
+    }
+  }
+
+static void sharp_destroy_mpi_info (sharp_mpi_info *minfo)
+  { /* releases every array owned by *minfo; the struct itself is not freed */
+  DEALLOC(minfo->mapdisp);
+  DEALLOC(minfo->mapcount);
+  DEALLOC(minfo->almdisp);
+  DEALLOC(minfo->almcount);
+  DEALLOC(minfo->ispair);
+  DEALLOC(minfo->theta);
+  DEALLOC(minfo->ofs_pair);
+  DEALLOC(minfo->npair);
+  DEALLOC(minfo->mval);
+  DEALLOC(minfo->ofs_m);
+  DEALLOC(minfo->nm);
+  }
+
+static void sharp_communicate_alm2map (const sharp_mpi_info *minfo, dcmplx **ph)
+  { /* Exchanges phase data so that afterwards *ph holds all m values for the local ring pairs; *ph is freed and reallocated. */
+  dcmplx *phas_tmp = RALLOC(dcmplx,minfo->mapdisp[minfo->ntasks]/2); /* mapdisp[] counts doubles, hence /2 */
+
+  MPI_Alltoallv (*ph,minfo->almcount,minfo->almdisp,MPI_DOUBLE,phas_tmp,
+    minfo->mapcount,minfo->mapdisp,MPI_DOUBLE,minfo->comm);
+  /* the old buffer has been sent; replace it by one sized for the received data */
+  DEALLOC(*ph);
+  ALLOC(*ph,dcmplx,minfo->nph*minfo->npair[minfo->mytask]*minfo->nmtotal);
+  /* reorder from (source task, local ring pair, m index on that task) to (local ring pair, m) */
+  for (int task=0; task<minfo->ntasks; ++task)
+    for (int th=0; th<minfo->npair[minfo->mytask]; ++th)
+      for (int mi=0; mi<minfo->nm[task]; ++mi)
+        {
+        int m = minfo->mval[mi+minfo->ofs_m[task]];
+        int o1 = minfo->nph*(th*(minfo->mmax+1) + m); /* destination; row length mmax+1 relies on nmtotal==mmax+1 */
+        int o2 = minfo->mapdisp[task]/2+minfo->nph*(mi+th*minfo->nm[task]); /* source inside the chunk received from task */
+        for (int i=0; i<minfo->nph; ++i)
+          (*ph)[o1+i] = phas_tmp[o2+i];
+        }
+  DEALLOC(phas_tmp);
+  }
+
+static void sharp_communicate_map2alm (const sharp_mpi_info *minfo, dcmplx **ph)
+  { /* Reverse of sharp_communicate_alm2map(): afterwards *ph holds the local m values for all ring pairs; *ph is freed and reallocated. */
+  dcmplx *phas_tmp = RALLOC(dcmplx,minfo->mapdisp[minfo->ntasks]/2); /* send buffer; mapdisp[] counts doubles, hence /2 */
+  /* pack (local ring pair, m) data into one contiguous chunk per destination task */
+  for (int task=0; task<minfo->ntasks; ++task)
+    for (int th=0; th<minfo->npair[minfo->mytask]; ++th)
+      for (int mi=0; mi<minfo->nm[task]; ++mi)
+        {
+        int m = minfo->mval[mi+minfo->ofs_m[task]];
+        int o1 = minfo->mapdisp[task]/2+minfo->nph*(mi+th*minfo->nm[task]); /* destination inside the chunk for task */
+        int o2 = minfo->nph*(th*(minfo->mmax+1) + m); /* source; row length is mmax+1 */
+        for (int i=0; i<minfo->nph; ++i)
+          phas_tmp[o1+i] = (*ph)[o2+i];
+        }
+  /* everything needed has been copied out, so the buffer can be resized for the incoming data */
+  DEALLOC(*ph);
+  ALLOC(*ph,dcmplx,minfo->nph*minfo->nm[minfo->mytask]*minfo->npairtotal);
+
+  MPI_Alltoallv (phas_tmp,minfo->mapcount,minfo->mapdisp,MPI_DOUBLE,
+    *ph,minfo->almcount,minfo->almdisp,MPI_DOUBLE,minfo->comm);
+
+  DEALLOC(phas_tmp);
+  }
+
+static void alloc_phase_mpi (sharp_job *job, int nm, int ntheta,
+  int nmfull, int nthetafull)
+  { /* allocates job->phase for the stage that runs before the MPI exchange */
+  ptrdiff_t nentries; /* (m, ring pair) combinations the buffer has to hold */
+  if (job->type==MAP2ALM) nentries=(ptrdiff_t)(nmfull)*ntheta; else nentries=(ptrdiff_t)(nm)*nthetafull;
+  job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*nentries);
+  }
+
+static void alm2map_comm (sharp_job *job, const sharp_mpi_info *minfo)
+  { /* phase exchange for every job type except MAP2ALM */
+  if (job->type == MAP2ALM) return;
+  sharp_communicate_alm2map (minfo,&job->phase);
+  }
+
+static void map2alm_comm (sharp_job *job, const sharp_mpi_info *minfo)
+  { /* phase exchange for MAP2ALM jobs only */
+  if (job->type != MAP2ALM) return;
+  sharp_communicate_map2alm (minfo,&job->phase);
+  }
+
+void sharp_execute_job_mpi (sharp_job *job, MPI_Comm comm)
+  { /* Runs job distributed over the tasks of comm; contains collective calls, so every task of comm has to enter it. */
+  double timer=wallTime();
+  int ntasks;
+  MPI_Comm_size(comm, &ntasks);
+  if (ntasks==1) /* fall back to scalar implementation */
+    { sharp_execute_job (job); return; }
+
+  int lmax = job->ainfo->lmax;
+
+  job->norm_l = Ylmgen_get_norm (lmax, job->spin); /* released again at the end of this function */
+
+  sharp_mpi_info minfo;
+  sharp_make_mpi_info(comm, job, &minfo);
+
+/* clear output arrays if requested */
+  init_output (job);
+
+  alloc_phase_mpi (job,job->ainfo->nm,job->ginfo->npairs,minfo.mmax+1,
+    minfo.npairtotal);
+  /* cos/sin of theta for the ring pairs of ALL tasks, plus an index permutation ordered by idx_compare (defined elsewhere) on sin(theta) */
+  double *cth = RALLOC(double,minfo.npairtotal),
+         *sth = RALLOC(double,minfo.npairtotal);
+  idxhelper *stmp = RALLOC(idxhelper,minfo.npairtotal);
+  for (int i=0; i<minfo.npairtotal; ++i)
+    {
+    cth[i] = cos(minfo.theta[i]);
+    sth[i] = sin(minfo.theta[i]);
+    stmp[i].s=sth[i];
+    stmp[i].i=i;
+    }
+  qsort (stmp,minfo.npairtotal,sizeof(idxhelper),idx_compare);
+  int *idx = RALLOC(int,minfo.npairtotal);
+  for (int i=0; i<minfo.npairtotal; ++i)
+    idx[i]=stmp[i].i;
+  DEALLOC(stmp);
+
+/* map->phase where necessary */
+  map2phase (job, minfo.mmax, 0, job->ginfo->npairs);
+
+  map2alm_comm (job, &minfo); /* no-op unless job->type==MAP2ALM */
+
+#pragma omp parallel
+{
+  sharp_job ljob = *job; /* per-thread copy; its opcnt is merged back below */
+  Ylmgen_C generator;
+  Ylmgen_init (&generator,lmax,minfo.mmax,ljob.spin);
+  alloc_almtmp(&ljob,lmax);
+
+#pragma omp for schedule(dynamic,1)
+  for (int mi=0; mi<job->ainfo->nm; ++mi)
+    {
+/* alm->alm_tmp where necessary */
+    alm2almtmp (&ljob, lmax, mi);
+
+/* inner conversion loop */
+    inner_loop (&ljob, minfo.ispair, cth, sth, 0, minfo.npairtotal,
+      &generator, mi, idx);
+
+/* alm_tmp->alm where necessary */
+    almtmp2alm (&ljob, lmax, mi);
+    }
+
+  Ylmgen_destroy(&generator);
+  dealloc_almtmp(&ljob);
+
+#pragma omp critical
+  job->opcnt+=ljob.opcnt;
+} /* end of parallel region */
+
+  alm2map_comm (job, &minfo); /* no-op if job->type==MAP2ALM */
+
+/* phase->map where necessary */
+  phase2map (job, minfo.mmax, 0, job->ginfo->npairs);
+
+  DEALLOC(cth);
+  DEALLOC(sth);
+  DEALLOC(idx);
+  DEALLOC(job->norm_l);
+  dealloc_phase (job);
+  sharp_destroy_mpi_info(&minfo);
+  job->time=wallTime()-timer; /* elapsed wall time of this call on the local task */
+  }
+
+#endif
diff --git a/libsharp/sharp_mpi.h b/libsharp/sharp_mpi.h
new file mode 100644
index 0000000..3bef24a
--- /dev/null
+++ b/libsharp/sharp_mpi.h
@@ -0,0 +1,48 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_mpi.h
+ *  Interface for the spherical transform library with MPI support.
+ *
+ *  Copyright (C) 2011,2012 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#ifndef PLANCK_SHARP_MPI_H
+#define PLANCK_SHARP_MPI_H
+
+#include <mpi.h>
+#include "sharp.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void sharp_execute_job_mpi (sharp_job *job, MPI_Comm comm); /* runs job across the tasks of comm; to be called by every task of comm (implemented in sharp_mpi.c) */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/libsharp/sharp_test.c b/libsharp/sharp_test.c
new file mode 100644
index 0000000..6bcd253
--- /dev/null
+++ b/libsharp/sharp_test.c
@@ -0,0 +1,243 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_test.c
+    Accuracy test for libsharp's map analysis.
+
+    This program first generates a_lm coefficients up to
+    a user-specified lmax (with mmax=lmax); where applicable, the
+    real and imaginary parts of the coefficients are uniform
+    random numbers of the interval [-1;1[.
+    Afterwards, the random a_lm are converted to a map.
+    This map is analyzed (optionally using an iterative scheme
+    with a user-supplied number of steps).
+    After every iteration, the code then outputs the RMS of the residual a_lm
+    (i.e. the difference between the current and original a_lm), divided by
+    the RMS of the original a_lm, as well as the maximum absolute change of any
+    real or imaginary part between the current and original a_lm.
+
+    This operation can be performed for several different pixelisations:
+      - a Gaussian with the minimal number of rings for exact analysis
+        and a user-defined ring resolution
+      - an ECP grid with the minimal number of rings for exact analysis
+        and a user-defined ring resolution
+      - a Healpix grid with a user-defined Nside parameter.
+
+    The user can specify the spin of the desired transform.
+
+    Copyright (C) 2006-2012 Max-Planck-Society
+    \author Martin Reinecke
+*/
+
+#include <stdio.h>
+#include <string.h>
+#ifdef USE_MPI
+#include "mpi.h"
+#endif
+#include "sharp.h"
+#include "sharp_geomhelpers.h"
+#include "sharp_almhelpers.h"
+#include "c_utils.h"
+#include "sharp_core.h"
+
+typedef complex double dcmplx;
+
+static double drand (double min, double max) /* next rand() value mapped linearly onto [min;max[ */
+  { const double span=max-min; return min + span*rand()/(RAND_MAX+1.0); }
+
+static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin)
+  { /* fills alm with random values from drand(-1,1); entries with l<spin and m<spin are zeroed, m==0 entries get no imaginary part */
+  for (int mi=0; mi<helper->nm; ++mi)
+    {
+    const int mcur=helper->mval[mi];
+    /* l runs from m up to lmax for every stored m */
+    for (int ell=mcur; ell<=helper->lmax; ++ell)
+      {
+      dcmplx *coeff=&alm[sharp_alm_index(helper,ell,mi)];
+      if ((ell<spin)&&(mcur<spin))
+        { *coeff = 0.; continue; }
+      /* the real part is drawn before the imaginary part */
+      const double re_part = drand(-1,1);
+      const double im_part = (mcur==0) ? 0 : drand(-1,1);
+      *coeff = re_part+_Complex_I*im_part;
+      }
+    }
+  }
+
+static void measure_errors (dcmplx **alm, dcmplx **alm2,
+  ptrdiff_t nalms, int ncomp)
+  { /* prints, per component, RMS(alm-alm2)/RMS(alm) and the largest absolute difference of any real or imaginary part */
+  for (int icomp=0; icomp<ncomp; ++icomp)
+    {
+    const dcmplx *ref=alm[icomp], *cmp=alm2[icomp];
+    double err2=0, ref2=0, worst=0;
+    for (ptrdiff_t k=0; k<nalms; ++k)
+      {
+      const double re=creal(ref[k]), im=cimag(ref[k]);
+      const double dre=re-creal(cmp[k]), dim=im-cimag(cmp[k]);
+      err2+=dre*dre+dim*dim;
+      ref2+=re*re+im*im;
+      worst=(fabs(dre)>worst) ? fabs(dre) : worst;
+      worst=(fabs(dim)>worst) ? fabs(dim) : worst;
+      }
+    const double rms_err=sqrt(err2/nalms), rms_ref=sqrt(ref2/nalms);
+    printf("component %i: rms %e, maxerr %e\n",icomp, rms_err/rms_ref, worst);
+    }
+  }
+
+static void map2alm_iter (sharp_geom_info *tinfo, double **map,
+  dcmplx **alm_orig, dcmplx **alm, int lmax, int mmax,
+  ptrdiff_t npix, ptrdiff_t nalms, int spin, int ntrans, int niter)
+  { /* Analyses map into alm, then runs niter correction passes; after every pass the error against alm_orig is printed. */
+  int ncomp = ntrans*((spin==0) ? 1 : 2); /* two components per transform unless spin==0 */
+
+  sharp_alm_info *alms;
+  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
+  /* initial analysis: map -> alm */
+  sharp_job job;
+  sharpd_build_job(&job,MAP2ALM,spin,0,&alm[0],&map[0],tinfo,alms,ntrans);
+  sharp_execute_job(&job);
+  printf("wall time for map2alm: %fs\n",job.time);
+  printf("Performance: %fGFLOPs/s\n",1e-9*job.opcnt/job.time);
+  measure_errors(alm_orig,alm,nalms,ncomp);
+
+  for (int iter=0; iter<niter; ++iter)
+    {
+    double **map2; /* synthesised map, overwritten below by the residual map-map2 */
+    ALLOC2D(map2,double,ncomp,npix);
+    printf ("\niteration %i:\n", iter+1);
+    sharpd_build_job(&job,ALM2MAP,spin,0,&alm[0],&map2[0],tinfo,alms,ntrans);
+    sharp_execute_job(&job);
+    printf("wall time for alm2map: %fs\n",job.time);
+    printf("Performance: %fGFLOPs/s\n",1e-9*job.opcnt/job.time);
+    for (int i=0; i<ncomp; ++i)
+      for (ptrdiff_t m=0; m<npix; ++m)
+        map2[i][m] = map[i][m]-map2[i][m];
+    /* NOTE(review): the 4th argument is 1 here (0 elsewhere); presumably it makes the result accumulate onto alm -- confirm */
+    sharpd_build_job(&job,MAP2ALM,spin,1,&alm[0],&map2[0],tinfo,alms,ntrans);
+    sharp_execute_job(&job);
+    printf("wall time for map2alm: %fs\n",job.time);
+    printf("Performance: %fGFLOPs/s\n",1e-9*job.opcnt/job.time);
+    DEALLOC2D(map2);
+    measure_errors(alm_orig,alm,nalms,ncomp);
+    }
+
+  sharp_destroy_alm_info(alms);
+  }
+
+static void check_accuracy (sharp_geom_info *tinfo, ptrdiff_t lmax,
+  ptrdiff_t mmax, ptrdiff_t npix, int spin, int ntrans, int niter)
+  { /* Creates random a_lm, synthesises a map on the grid tinfo and hands both to map2alm_iter() for the accuracy report. */
+  ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax); /* = sum over m=0..mmax of (lmax-m+1) */
+  int ncomp = ntrans*((spin==0) ? 1 : 2); /* two components per transform unless spin==0 */
+
+  double **map;
+  ALLOC2D(map,double,ncomp,npix);
+
+  sharp_alm_info *alms;
+  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
+
+  srand(4); /* fixed seed: same random a_lm on every run */
+  dcmplx **alm;
+  ALLOC2D(alm,dcmplx,ncomp,nalms);
+  for (int i=0; i<ncomp; ++i)
+    random_alm(alm[i],alms,spin);
+
+  dcmplx **alm2; /* receives the recovered a_lm */
+  ALLOC2D(alm2,dcmplx,ncomp,nalms);
+
+  sharp_job job;
+  printf ("\niteration 0:\n");
+  sharpd_build_job(&job,ALM2MAP,spin,0,&alm[0],&map[0],tinfo,alms,ntrans);
+  sharp_execute_job(&job);
+  printf("wall time for alm2map: %fs\n",job.time);
+  printf("Performance: %fGFLOPs/s\n",1e-9*job.opcnt/job.time);
+
+  map2alm_iter(tinfo,map,alm,alm2,lmax,mmax,npix,nalms,spin,ntrans,niter);
+
+  DEALLOC2D(map);
+  DEALLOC2D(alm);
+  DEALLOC2D(alm2);
+
+  sharp_destroy_alm_info(alms);
+  }
+
+int main(int argc, char **argv)
+  { /* Command line: <healpix|ecp|gauss> <lmax> <nside|nphi> <niter> <spin> <ntrans>; runs check_accuracy() on the chosen grid. */
+#ifdef USE_MPI
+  MPI_Init(NULL,NULL);
+#endif
+  module_startup_c("sharp_test",argc,7,
+    "<healpix|ecp|gauss> <lmax> <nside|nphi> <niter> <spin> <ntrans>",1);
+  /* NOTE(review): atoi() performs no validation; presumably module_startup_c() has already checked argc against 7 -- confirm */
+  int lmax=atoi(argv[2]);
+  int niter=atoi(argv[4]);
+  int spin=atoi(argv[5]);
+  int ntrans=atoi(argv[6]);
+
+  printf("Testing map analysis accuracy.\n");
+  printf("lmax=%d, %d iterations, spin=%d\n", lmax, niter, spin);
+
+  sharp_geom_info *tinfo;
+  if (strcmp(argv[1],"gauss")==0)
+    {
+    int nrings=lmax+1;
+    int ppring=atoi(argv[3]);
+    ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
+    printf("\nTesting Gaussian grid (%d rings, %d pixels/ring, %ld pixels)\n",
+          nrings,ppring,(long)npix);
+    sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo); /* pixel stride 1, ring stride ppring: rings stored back to back */
+    check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
+    sharp_destroy_geom_info(tinfo);
+    }
+  else if (strcmp(argv[1],"ecp")==0)
+    {
+    int nrings=2*lmax+2;
+    int ppring=atoi(argv[3]);
+    ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
+    printf("\nTesting ECP grid (%d rings, %d pixels/ring, %ld pixels)\n",
+          nrings,ppring,(long)npix);
+    sharp_make_ecp_geom_info (nrings, ppring, 0., 1, ppring, &tinfo); /* phi0=0, same storage layout as the Gaussian case */
+    check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
+    sharp_destroy_geom_info(tinfo);
+    }
+  else if (strcmp(argv[1],"healpix")==0)
+    {
+    int nside=atoi(argv[3]);
+    if (nside<1) nside=1; /* clamp non-positive or unparsable input */
+    ptrdiff_t npix=12*(ptrdiff_t)nside*nside;
+    printf("\nTesting Healpix grid (nside=%d, %ld pixels)\n",
+          nside,(long)npix);
+    sharp_make_healpix_geom_info (nside, 1, &tinfo);
+    check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
+    sharp_destroy_geom_info(tinfo);
+    }
+  else
+    UTIL_FAIL("unknown grid geometry");
+
+#ifdef USE_MPI
+  MPI_Finalize();
+#endif
+  return 0;
+  }
diff --git a/libsharp/sharp_test_mpi.c b/libsharp/sharp_test_mpi.c
new file mode 100644
index 0000000..e8bd79b
--- /dev/null
+++ b/libsharp/sharp_test_mpi.c
@@ -0,0 +1,354 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file sharp_test_mpi.c
+    Accuracy test for libsharp's map analysis with MPI support.
+
+    This program first generates a_lm coefficients up to
+    a user-specified lmax (with mmax=lmax); where applicable, the
+    real and imaginary parts of the coefficients are uniform
+    random numbers of the interval [-1;1[.
+    Afterwards, the random a_lm are converted to a map.
+    This map is analyzed (optionally using an iterative scheme
+    with a user-supplied number of steps).
+    After every iteration, the code then outputs the RMS of the residual a_lm
+    (i.e. the difference between the current and original a_lm), divided by
+    the RMS of the original a_lm, as well as the maximum absolute change of any
+    real or imaginary part between the current and original a_lm.
+
+    This operation can be performed for several different pixelisations:
+      - a Gaussian with the minimal number of rings for exact analysis
+        and a user-defined ring resolution
+      - an ECP grid with the minimal number of rings for exact analysis
+        and a user-defined ring resolution
+      - a Healpix grid with a user-defined Nside parameter.
+
+    The user can specify the spin of the desired transform.
+
+    Copyright (C) 2006-2012 Max-Planck-Society
+    \author Martin Reinecke
+*/
+
+#ifdef USE_MPI
+
+#include <stdio.h>
+#include <string.h>
+#include "sharp_mpi.h"
+#include "sharp_geomhelpers.h"
+#include "sharp_almhelpers.h"
+#include "c_utils.h"
+#include "walltime_c.h"
+#include "sharp_core.h"
+
+typedef complex double dcmplx;
+
+int ntasks, mytask; /* size of MPI_COMM_WORLD and own rank in it; both set at the start of main() */
+
+static unsigned long long totalops (unsigned long long val)
+  { /* sum of val over all tasks of MPI_COMM_WORLD; every task receives the result */
+  unsigned long long global_sum;
+  MPI_Allreduce (&val, &global_sum,1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);
+  return global_sum;
+  }
+
+static double maxTime (double val)
+  { /* largest val over all tasks of MPI_COMM_WORLD; every task receives the result */
+  double global_max;
+  MPI_Allreduce (&val, &global_max,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  return global_max;
+  }
+
+static double drand (double min, double max) /* next rand() value mapped linearly onto [min;max[ */
+  { const double span=max-min; return min + span*rand()/(RAND_MAX+1.0); }
+
+static ptrdiff_t get_nalms(const sharp_alm_info *ainfo)
+  { /* number of a_lm described by ainfo: (lmax-m+1) for each of its m values */
+  ptrdiff_t total=0;
+  for (int im=ainfo->nm-1; im>=0; --im)
+    total += ainfo->lmax-ainfo->mval[im]+1;
+  return total;
+  }
+
+static ptrdiff_t get_npix(const sharp_geom_info *ginfo)
+  { /* pixels described by ginfo: ring 1 of every pair, plus ring 2 wherever it has pixels */
+  ptrdiff_t total=0;
+  for (int ipair=0; ipair<ginfo->npairs; ++ipair)
+    {
+    total += ginfo->pair[ipair].r1.nph;
+    total += (ginfo->pair[ipair].r2.nph>0) ? ginfo->pair[ipair].r2.nph : 0;
+    }
+  return total;
+  }
+
+static void reduce_alm_info(sharp_alm_info *ainfo)
+  { /* keeps every ntasks-th m value starting at index mytask and recomputes mvstart from a running offset */
+  int nkept=0;
+  ptrdiff_t next_start=0;
+  for (int isrc=mytask; isrc<ainfo->nm; isrc+=ntasks)
+    {
+    const int m=ainfo->mval[nkept]=ainfo->mval[isrc];
+    ainfo->mvstart[nkept++]=next_start-m;
+    next_start+=ainfo->lmax-m+1;
+    }
+  ainfo->nm=nkept;
+  }
+
+static void reduce_geom_info(sharp_geom_info *ginfo)
+  { /* keeps every ntasks-th ring pair starting at index mytask and renumbers the pixel offsets of the kept rings consecutively */
+  ptrdiff_t next_ofs=0;
+  int nkept=0;
+  for (int isrc=mytask; isrc<ginfo->npairs; isrc+=ntasks, ++nkept)
+    {
+    ginfo->pair[nkept]=ginfo->pair[isrc];
+    ginfo->pair[nkept].r1.ofs=next_ofs;
+    next_ofs+=ginfo->pair[nkept].r1.nph;
+    ginfo->pair[nkept].r2.ofs=next_ofs; /* assigned even when ring 2 has no pixels */
+    next_ofs+=(ginfo->pair[nkept].r2.nph>0) ? ginfo->pair[nkept].r2.nph : 0;
+    }
+  ginfo->npairs=nkept;
+  }
+
+static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin)
+  { /* fills alm with random values from drand(-1,1); entries with l<spin and m<spin are zeroed, m==0 entries get no imaginary part */
+  static int ncalls=0; /* counts invocations and enters the seed below */
+  ++ncalls;
+  for (int mi=0; mi<helper->nm; ++mi)
+    {
+    const int mcur=helper->mval[mi];
+    /* the generator is reseeded for every m from the call count and m alone */
+    srand(1234567*ncalls+8912*mcur);
+    for (int ell=mcur; ell<=helper->lmax; ++ell)
+      {
+      dcmplx *coeff=&alm[sharp_alm_index(helper,ell,mi)];
+      if ((ell<spin)&&(mcur<spin))
+        { *coeff = 0.; continue; }
+      /* the real part is drawn before the imaginary part */
+      const double re_part = drand(-1,1);
+      const double im_part = (mcur==0) ? 0 : drand(-1,1);
+      *coeff = re_part+_Complex_I*im_part;
+      }
+    }
+  }
+
+static void measure_errors (dcmplx **alm, dcmplx **alm2,
+  const sharp_alm_info *ainfo, int ncomp)
+  { /* Prints on task 0, per component, RMS(alm-alm2)/RMS(alm) and the largest absolute difference, reduced over all tasks; collective. */
+  long nalms=get_nalms(ainfo), nalms_tot;
+  MPI_Allreduce(&nalms,&nalms_tot,1,MPI_LONG,MPI_SUM,MPI_COMM_WORLD);
+
+  for (int i=0; i<ncomp; ++i)
+    {
+    double sum=0, sum2=0, maxdiff=0, sumtot, sum2tot, maxdifftot; /* local accumulators and their global counterparts */
+    for (int mi=0; mi<ainfo->nm; ++mi)
+      {
+      int m=ainfo->mval[mi];
+      for (int l=m; l<=ainfo->lmax; ++l)
+        {
+        ptrdiff_t idx=sharp_alm_index(ainfo,l,mi);
+        double x=creal(alm[i][idx])-creal(alm2[i][idx]),
+               y=cimag(alm[i][idx])-cimag(alm2[i][idx]);
+        sum+=x*x+y*y;
+        sum2+=creal(alm[i][idx])*creal(alm[i][idx])
+             +cimag(alm[i][idx])*cimag(alm[i][idx]);
+        if (fabs(x)>maxdiff) maxdiff=fabs(x);
+        if (fabs(y)>maxdiff) maxdiff=fabs(y);
+        }
+      }
+    /* combine the per-task partial results; every task takes part even though only task 0 prints */
+    MPI_Allreduce(&sum,&sumtot,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
+    MPI_Allreduce(&sum2,&sum2tot,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
+    MPI_Allreduce(&maxdiff,&maxdifftot,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD);
+    sumtot=sqrt(sumtot/nalms_tot);
+    sum2tot=sqrt(sum2tot/nalms_tot);
+    if (mytask==0)
+      printf("component %i: rms %e, maxerr %e\n",i, sumtot/sum2tot, maxdifftot);
+    }
+  }
+
+static void map2alm_iter (sharp_geom_info *tinfo, double **map,
+  dcmplx **alm_orig, dcmplx **alm, int lmax, int mmax,
+  ptrdiff_t npix, int spin, int ntrans, int niter)
+  { /* Analyses the local part of map into alm, then runs niter correction passes; task 0 prints timings, measure_errors() the accuracy. Collective. */
+  int ncomp = ntrans*((spin==0) ? 1 : 2); /* two components per transform unless spin==0 */
+
+  sharp_alm_info *alms;
+  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
+  reduce_alm_info(alms); /* keep only the m values handled by this task */
+  /* initial analysis: map -> alm */
+  sharp_job job;
+  sharpd_build_job(&job,MAP2ALM,spin,0,&alm[0],&map[0],tinfo,alms,ntrans);
+  sharp_execute_job_mpi(&job,MPI_COMM_WORLD);
+  unsigned long long opcnt=totalops(job.opcnt);
+  double timer=maxTime(job.time); /* a duration: the slowest task's job.time */
+  if (mytask==0) printf("wall time for map2alm: %fs\n",timer);
+  if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer);
+  measure_errors(alm_orig,alm,alms,ncomp);
+
+  for (int iter=0; iter<niter; ++iter)
+    {
+    double **map2; /* synthesised map, overwritten below by the residual map-map2 */
+    ALLOC2D(map2,double,ncomp,npix);
+    if (mytask==0) printf ("\niteration %i:\n", iter+1);
+    sharpd_build_job(&job,ALM2MAP,spin,0,&alm[0],&map2[0],tinfo,alms,ntrans);
+    sharp_execute_job_mpi(&job,MPI_COMM_WORLD);
+    opcnt=totalops(job.opcnt);
+    timer=maxTime(job.time);
+    if (mytask==0) printf("wall time for alm2map: %fs\n",timer);
+    if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer);
+    for (int i=0; i<ncomp; ++i)
+      for (ptrdiff_t m=0; m<npix; ++m)
+        map2[i][m] = map[i][m]-map2[i][m];
+    /* NOTE(review): the 4th argument is 1 here (0 elsewhere); presumably it makes the result accumulate onto alm -- confirm */
+    sharpd_build_job(&job,MAP2ALM,spin,1,&alm[0],&map2[0],tinfo,alms,ntrans);
+    sharp_execute_job_mpi(&job,MPI_COMM_WORLD);
+    opcnt=totalops(job.opcnt);
+    timer=maxTime(job.time);
+    if (mytask==0) printf("wall time for map2alm: %fs\n",timer); /* timer is already a duration; do not subtract it from wallTime() */
+    if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer);
+    DEALLOC2D(map2);
+    measure_errors(alm_orig,alm,alms,ncomp);
+    }
+
+  sharp_destroy_alm_info(alms);
+  }
+
+static void check_accuracy (sharp_geom_info *tinfo, ptrdiff_t lmax,
+  ptrdiff_t mmax, ptrdiff_t npix, int spin, int ntrans, int niter)
+  { /* Creates this task's share of random a_lm, synthesises the local map on tinfo and hands both to map2alm_iter(). Collective. */
+  int ncomp = ntrans*((spin==0) ? 1 : 2); /* two components per transform unless spin==0 */
+
+  double **map;
+  ALLOC2D(map,double,ncomp,npix);
+
+  sharp_alm_info *alms;
+  ptrdiff_t nalms;
+  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
+  reduce_alm_info(alms); /* keep only the m values handled by this task */
+  nalms=get_nalms(alms);
+
+  dcmplx **alm;
+  ALLOC2D(alm,dcmplx,ncomp,nalms);
+  srand(4); /* random_alm() calls srand() again for every m, so this seed does not influence the values */
+  for (int i=0; i<ncomp; ++i)
+    random_alm(alm[i],alms,spin);
+
+  dcmplx **alm2; /* receives the recovered a_lm */
+  ALLOC2D(alm2,dcmplx,ncomp,nalms);
+
+  if (mytask==0) printf ("\niteration 0:\n");
+  sharp_job job;
+  sharpd_build_job(&job,ALM2MAP,spin,0,&alm[0],&map[0],tinfo,alms,ntrans);
+  sharp_execute_job_mpi(&job,MPI_COMM_WORLD);
+  unsigned long long opcnt=totalops(job.opcnt);
+  double timer=maxTime(job.time); /* a duration: the slowest task's job.time */
+  if (mytask==0) printf("wall time for alm2map: %fs\n",timer);
+  if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer);
+
+  map2alm_iter(tinfo, map, alm, alm2, lmax, mmax, npix, spin, ntrans, niter);
+
+  DEALLOC2D(map);
+  DEALLOC2D(alm);
+  DEALLOC2D(alm2);
+
+  sharp_destroy_alm_info(alms);
+  }
+
+int main(int argc, char **argv)
+  { /* Command line: <healpix|ecp|gauss> <lmax> <nside|nphi> <niter> <spin> <ntrans>; every task keeps its share of the grid and runs check_accuracy(). */
+  MPI_Init(NULL,NULL);
+  MPI_Comm_size(MPI_COMM_WORLD,&ntasks);
+  MPI_Comm_rank(MPI_COMM_WORLD,&mytask);
+
+  module_startup_c("sharp_test_mpi",argc,7,
+    "<healpix|ecp|gauss> <lmax> <nside|nphi> <niter> <spin> <ntrans>",
+    mytask==0);
+  int lmax=atoi(argv[2]); /* NOTE(review): atoi() performs no validation; presumably module_startup_c() has checked argc against 7 -- confirm */
+  int niter=atoi(argv[4]);
+  int spin=atoi(argv[5]);
+  int ntrans=atoi(argv[6]);
+
+  if (mytask==0)
+    {
+    printf("Testing map analysis accuracy.\n");
+    printf("lmax=%d, %d iterations, spin=%d\n", lmax, niter, spin);
+    }
+
+  sharp_geom_info *tinfo;
+  if (strcmp(argv[1],"gauss")==0)
+    {
+    int nrings=lmax+1;
+    int ppring=atoi(argv[3]);
+    ptrdiff_t npix=(ptrdiff_t)nrings*ppring; /* size of the full grid, used for the message only */
+    if (mytask==0)
+      printf("\nTesting Gaussian grid (%d rings, %d pixels/ring, %ld pixels)\n",
+             nrings,ppring,(long)npix);
+    sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo);
+    reduce_geom_info(tinfo); /* keep only this task's ring pairs */
+    npix=get_npix(tinfo); /* from here on: number of local pixels */
+    check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
+    sharp_destroy_geom_info(tinfo);
+    }
+  else if (strcmp(argv[1],"ecp")==0)
+    {
+    int nrings=2*lmax+2;
+    int ppring=atoi(argv[3]);
+    ptrdiff_t npix=(ptrdiff_t)nrings*ppring; /* size of the full grid, used for the message only */
+    if (mytask==0)
+      printf("\nTesting ECP grid (%d rings, %d pixels/ring, %ld pixels)\n",
+             nrings,ppring,(long)npix);
+    sharp_make_ecp_geom_info (nrings, ppring, 0., 1, ppring, &tinfo);
+    reduce_geom_info(tinfo); /* keep only this task's ring pairs */
+    npix=get_npix(tinfo); /* from here on: number of local pixels */
+    check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
+    sharp_destroy_geom_info(tinfo);
+    }
+  else if (strcmp(argv[1],"healpix")==0)
+    {
+    int nside=atoi(argv[3]);
+    if (nside<1) nside=1; /* clamp non-positive or unparsable input */
+    ptrdiff_t npix=12*(ptrdiff_t)nside*nside; /* size of the full grid, used for the message only */
+    if (mytask==0)
+      printf("\nTesting Healpix grid (nside=%d, %ld pixels)\n",
+             nside,(long)npix);
+    sharp_make_healpix_geom_info (nside, 1, &tinfo);
+    reduce_geom_info(tinfo); /* keep only this task's ring pairs */
+    npix=get_npix(tinfo); /* from here on: number of local pixels */
+    check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter);
+    sharp_destroy_geom_info(tinfo);
+    }
+  else
+    UTIL_FAIL("unknown grid geometry");
+
+  MPI_Finalize();
+  return 0;
+  }
+
+#else
+
+#include "c_utils.h"
+
+int main(void) { /* stand-in program for builds without USE_MPI: report the problem and fail */
+  UTIL_FAIL("MPI support not enabled."); return 1; }
+
+#endif
diff --git a/libsharp/vecsupport.h b/libsharp/vecsupport.h
new file mode 100644
index 0000000..ccb9364
--- /dev/null
+++ b/libsharp/vecsupport.h
@@ -0,0 +1,158 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*  \file vecsupport.h
+ *  Convenience functions for vector arithmetic
+ *
+ *  Copyright (C) 2012 Max-Planck-Society
+ *  Author: Martin Reinecke
+ */
+
+#ifndef VECSUPPORT_H
+#define VECSUPPORT_H
+
+#include <math.h>
+#include "vec_utils.h"
+
+typedef double Ts; /* scalar element type */
+
+#if (VLEN==1)
+/* Scalar fallback: Tv is a single double; a mask is 1. (true) or 0. (false). */
+typedef double Tv;
+/* The *eq macros update their first argument in place. */
+#define vadd(a,b) ((a)+(b))
+#define vaddeq(a,b) ((a)+=(b))
+#define vsub(a,b) ((a)-(b))
+#define vsubeq(a,b) ((a)-=(b))
+#define vmul(a,b) ((a)*(b))
+#define vmuleq(a,b) ((a)*=(b))
+#define vfmaeq(a,b,c) ((a)+=(b)*(c))
+#define vfmseq(a,b,c) ((a)-=(b)*(c))
+#define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e))
+#define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e))
+#define vneg(a) (-(a))
+#define vload(a) (a)
+#define vabs(a) fabs(a)
+#define vsqrt(a) sqrt(a)
+#define vlt(a,b) (((a)<(b))?1.:0.)
+#define vgt(a,b) (((a)>(b))?1.:0.)
+#define vne(a,b) (((a)!=(b))?1.:0.)
+#define vand(a,b) ((((a)*(b))!=0.)?1.:0.) /* product of the masks is non-zero */
+
+static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
+static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }
+
+#define vanyTrue(a) ((a)!=0.) /* one lane only, so "any" and "all" coincide */
+#define vallTrue(a) ((a)!=0.)
+#define vblend(m,a,b) (((m)!=0.) ? (a) : (b)) /* a if mask m is set, else b */
+#define vzero 0.
+#define vone 1.
+
+#endif
+
+#if (VLEN==2)
+/* SSE2 variant: Tv holds two doubles; comparisons produce per-lane bit masks. */
+#include <emmintrin.h>
+
+#if defined (__SSE3__)
+#include <pmmintrin.h>
+#endif
+#if defined (__SSE4_1__)
+#include <smmintrin.h>
+#endif
+
+typedef __m128d Tv;
+/* The *eq macros assign to \a a and are parenthesised to compose safely. */
+#define vadd(a,b) _mm_add_pd(a,b)
+#define vaddeq(a,b) ((a)=_mm_add_pd(a,b))
+#define vsub(a,b) _mm_sub_pd(a,b)
+#define vsubeq(a,b) ((a)=_mm_sub_pd(a,b))
+#define vmul(a,b) _mm_mul_pd(a,b)
+#define vmuleq(a,b) ((a)=_mm_mul_pd(a,b))
+#define vfmaeq(a,b,c) ((a)=_mm_add_pd(a,_mm_mul_pd(b,c)))
+#define vfmseq(a,b,c) ((a)=_mm_sub_pd(a,_mm_mul_pd(b,c)))
+#define vfmaaeq(a,b,c,d,e) \
+  ((a)=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e))))
+#define vfmaseq(a,b,c,d,e) \
+  ((a)=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e))))
+#define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a) /* flips the sign bits */
+#define vload(a) _mm_set1_pd(a) /* broadcasts a scalar to both lanes */
+#define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a) /* clears the sign bits */
+#define vsqrt(a) _mm_sqrt_pd(a)
+#define vlt(a,b) _mm_cmplt_pd(a,b)
+#define vgt(a,b) _mm_cmpgt_pd(a,b)
+#define vne(a,b) _mm_cmpneq_pd(a,b)
+#define vand(a,b) _mm_and_pd(a,b)
+#define vmin(a,b) _mm_min_pd(a,b)
+#define vmax(a,b) _mm_max_pd(a,b) /* no trailing ';': usable in expressions */
+#define vanyTrue(a) (_mm_movemask_pd(a)!=0)
+#define vallTrue(a) (_mm_movemask_pd(a)==3) /* both lane bits set */
+#if defined(__SSE4_1__)
+#define vblend(m,a,b) _mm_blendv_pd(b,a,m)
+#else
+static inline Tv vblend(Tv m, Tv a, Tv b) /* a where m is set, b elsewhere */
+  { return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); }
+#endif
+#define vzero _mm_setzero_pd()
+#define vone _mm_set1_pd(1.)
+
+#endif
+
+#if (VLEN==4)
+/* AVX variant: Tv holds four doubles; comparisons produce per-lane bit masks. */
+#include <immintrin.h>
+
+typedef __m256d Tv;
+/* The *eq macros assign to \a a and are parenthesised to compose safely. */
+#define vadd(a,b) _mm256_add_pd(a,b)
+#define vaddeq(a,b) ((a)=_mm256_add_pd(a,b))
+#define vsub(a,b) _mm256_sub_pd(a,b)
+#define vsubeq(a,b) ((a)=_mm256_sub_pd(a,b))
+#define vmul(a,b) _mm256_mul_pd(a,b)
+#define vmuleq(a,b) ((a)=_mm256_mul_pd(a,b))
+#define vfmaeq(a,b,c) ((a)=_mm256_add_pd(a,_mm256_mul_pd(b,c)))
+#define vfmseq(a,b,c) ((a)=_mm256_sub_pd(a,_mm256_mul_pd(b,c)))
+#define vfmaaeq(a,b,c,d,e) \
+  ((a)=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e))))
+#define vfmaseq(a,b,c,d,e) \
+  ((a)=_mm256_add_pd(a,_mm256_sub_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e))))
+#define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a) /* flips the sign bits */
+#define vload(a) _mm256_set1_pd(a) /* broadcasts a scalar to all lanes */
+#define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a) /* clears sign bits */
+#define vsqrt(a) _mm256_sqrt_pd(a)
+#define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
+#define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ)
+#define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_UQ) /* true for NaN, like C's != */
+#define vand(a,b) _mm256_and_pd(a,b)
+#define vmin(a,b) _mm256_min_pd(a,b)
+#define vmax(a,b) _mm256_max_pd(a,b)
+#define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
+#define vallTrue(a) (_mm256_movemask_pd(a)==15) /* all four lane bits set */
+#define vblend(m,a,b) _mm256_blendv_pd(b,a,m) /* a where m is set, else b */
+#define vzero _mm256_setzero_pd()
+#define vone _mm256_set1_pd(1.)
+
+#endif
+
+#endif
diff --git a/libsharp/ylmgen_c.c b/libsharp/ylmgen_c.c
new file mode 100644
index 0000000..e674e63
--- /dev/null
+++ b/libsharp/ylmgen_c.c
@@ -0,0 +1,206 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*
+ *  Helper code for efficient calculation of Y_lm(theta,phi=0)
+ *
+ *  Copyright (C) 2005-2012 Max-Planck-Society
+ *  Author: Martin Reinecke
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include "ylmgen_c.h"
+#include "c_utils.h"
+
+/* Initialises \a gen for l<=l_max, m<=m_max and the non-negative \a spin.
+   Allocates gen->cf plus the tables of either the spin==0 or the spin!=0
+   branch; Ylmgen_destroy() releases them. The tables are sized by l_max but
+   indexed with spin and m, hence spin<=l_max and m_max<=l_max are required;
+   otherwise fac[] below would be read out of bounds. */
+void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin)
+  {
+  const double inv_sqrt4pi = 0.2820947917738781434740397257803862929220;
+
+  UTIL_ASSERT(spin>=0,"incorrect spin");
+  UTIL_ASSERT(l_max>=spin,"incorrect l_max: must be >= spin");
+  UTIL_ASSERT(l_max>=m_max,"incorrect l_max: must be >= m_max");
+  gen->lmax = l_max;
+  gen->mmax = m_max;
+  gen->s = spin;
+  UTIL_ASSERT((minscale<=0)&&(maxscale>0),"bad value for min/maxscale");
+  gen->cf=RALLOC(double,maxscale-minscale+1);
+  gen->cf[-minscale]=1.; /* cf[k-minscale]==fbig^k */
+  for (int m=-minscale-1; m>=0; --m)
+    gen->cf[m]=gen->cf[m+1]*fsmall;
+  for (int m=-minscale+1; m<(maxscale-minscale+1); ++m)
+    gen->cf[m]=gen->cf[m-1]*fbig;
+
+  gen->m = -1;
+  if (spin==0)
+    {
+    gen->rf = RALLOC(ylmgen_dbl2,gen->lmax+1);
+    gen->mfac = RALLOC(double,gen->mmax+1);
+    gen->mfac[0] = inv_sqrt4pi;
+    for (int m=1; m<=gen->mmax; ++m)
+      gen->mfac[m] = gen->mfac[m-1]*sqrt((2*m+1.)/(2*m));
+    gen->root = RALLOC(double,2*gen->lmax+5);
+    gen->iroot = RALLOC(double,2*gen->lmax+5);
+    for (int m=0; m<2*gen->lmax+5; ++m) /* sqrt(m) and 1/sqrt(m), 0 for m==0 */
+      { gen->root[m] = sqrt(m); gen->iroot[m] = (m==0) ? 0. : 1./gen->root[m]; }
+    }
+  else
+    {
+    gen->m=gen->mlo=gen->mhi=-1234567890; /* sentinel: no m prepared yet */
+    ALLOC(gen->fx,ylmgen_dbl3,gen->lmax+2);
+    for (int m=0; m<gen->lmax+2; ++m)
+      gen->fx[m].f[0]=gen->fx[m].f[1]=gen->fx[m].f[2]=0.;
+    ALLOC(gen->inv,double,gen->lmax+1);
+    gen->inv[0]=0;
+    for (int m=1; m<gen->lmax+1; ++m) gen->inv[m]=1./m;
+    ALLOC(gen->flm1,double,2*gen->lmax+1);
+    ALLOC(gen->flm2,double,2*gen->lmax+1);
+    for (int m=0; m<2*gen->lmax+1; ++m)
+      { gen->flm1[m] = sqrt(1./(m+1.)); gen->flm2[m] = sqrt(m/(m+1.)); }
+    ALLOC(gen->prefac,double,gen->mmax+1);
+    ALLOC(gen->fscale,int,gen->mmax+1);
+    double *fac = RALLOC(double,2*gen->lmax+1);
+    int *facscale = RALLOC(int,2*gen->lmax+1);
+    fac[0]=1; facscale[0]=0; /* sqrt(m!) == fac[m]*fbig^facscale[m] */
+    for (int m=1; m<2*gen->lmax+1; ++m)
+      {
+      fac[m]=fac[m-1]*sqrt(m);
+      facscale[m]=facscale[m-1];
+      if (fac[m]>1.) { fac[m]*=fsmall; ++facscale[m]; }
+      }
+    for (int m=0; m<=gen->mmax; ++m)
+      {
+      int mlo=gen->s, mhi=m; /* 2*mhi<=2*l_max thanks to the checks above */
+      if (mhi<mlo) SWAP(mhi,mlo,int);
+      gen->prefac[m]=fac[2*mhi]/(fac[mhi+mlo]*fac[mhi-mlo]);
+      gen->fscale[m]=facscale[2*mhi]-facscale[mhi+mlo]-facscale[mhi-mlo];
+      }
+    DEALLOC(fac); DEALLOC(facscale);
+    }
+  }
+
+/* Frees every table owned by \a gen; which ones exist depends on whether
+   the generator was set up for spin 0 or for a non-zero spin. */
+void Ylmgen_destroy (Ylmgen_C *gen)
+  {
+  if (gen->s!=0)
+    {
+    DEALLOC(gen->inv);
+    DEALLOC(gen->flm2);
+    DEALLOC(gen->flm1);
+    DEALLOC(gen->fscale);
+    DEALLOC(gen->prefac);
+    DEALLOC(gen->fx);
+    }
+  else
+    {
+    DEALLOC(gen->iroot); DEALLOC(gen->root);
+    DEALLOC(gen->mfac); DEALLOC(gen->rf);
+    }
+  DEALLOC(gen->cf);
+  }
+
+/* Selects \a m (>=0) as the current azimuthal index and refreshes everything
+   that depends on it: rf[] for spin 0; fx[], the sin/cos exponents and the
+   sign flags otherwise. A repeated call with the current m is a no-op. */
+void Ylmgen_prepare (Ylmgen_C *gen, int m)
+  {
+  if (gen->m==m) return;
+  UTIL_ASSERT(m>=0,"incorrect m");
+  gen->m = m;
+  const int s = gen->s, lmax = gen->lmax;
+  if (s==0)
+    {
+    const double *root = gen->root, *iroot = gen->iroot;
+    ylmgen_dbl2 *rf = gen->rf;
+    rf[m].f[1] = 0.;
+    rf[m].f[0] = root[2*m+3];
+    for (int l=m+1; l<=lmax; ++l)
+      {
+      const double pre = root[2*l+3]*iroot[l+1+m]*iroot[l+1-m];
+      rf[l].f[0] = pre*root[2*l+1];
+      rf[l].f[1] = pre*root[l+m]*root[l-m]*iroot[2*l-1];
+      }
+    return;
+    }
+
+  const int hi = (s<m) ? m : s, lo = (s<m) ? s : m;
+  const int same_pair = (gen->mhi==hi) && (gen->mlo==lo);
+  gen->mlo = lo; gen->mhi = hi;
+
+  if (!same_pair) /* fx[] is only rebuilt when the (lo,hi) pair changed */
+    {
+    const double *f1 = gen->flm1, *f2 = gen->flm2, *inv = gen->inv;
+    for (int l=hi; l<lmax; ++l)
+      {
+      ylmgen_dbl3 *dst = &gen->fx[l+1];
+      const double lp1 = l+1, tlp1 = 2*l+1;
+      const double a = f1[l+m]*f1[l-m]*f1[l+s]*f1[l-s];
+      const double b = f2[l+m]*f2[l-m]*f2[l+s]*f2[l-s];
+      dst->f[0] = lp1*tlp1*a;
+      dst->f[1] = m*s*inv[l]*inv[l+1];
+      dst->f[2] = b*lp1*inv[l];
+      }
+    }
+
+  if (hi==m) /* m>=s */
+    {
+    gen->cosPow = hi+s; gen->sinPow = hi-s;
+    gen->preMinus_p = gen->preMinus_m = ((hi-s)&1);
+    }
+  else
+    {
+    gen->cosPow = hi+m; gen->sinPow = hi-m;
+    gen->preMinus_p = 0; gen->preMinus_m = ((hi+m)&1);
+    }
+  }
+
+/* Builds the lmax+1 normalisation factors for \a spin in a newly allocated
+   array: all 1 for spin 0, otherwise 0 for l<spin and
+   +-0.5*sqrt((2l+1)/(4pi)) for l>=spin. */
+double *Ylmgen_get_norm (int lmax, int spin)
+  {
+  const double pi = 3.141592653589793238462643383279502884197;
+  double *norm=RALLOC(double,lmax+1);
+  if (spin==0)
+    {
+    for (int l=0; l<=lmax; ++l) norm[l]=1.;
+    return norm;
+    }
+  /* sign convention for H=1 (LensPix paper) */
+#if 1
+  double sign = (spin>0) ? -1.0 : 1.0;
+#else
+  double sign = 1.0;
+#endif
+  if (spin&1) sign = -sign; /* odd spin flips the sign once more */
+  for (int l=0; l<=lmax; ++l)
+    norm[l] = (l>=spin) ? sign*0.5*sqrt((2*l+1)/(4*pi)) : 0.;
+  return norm;
+  }
diff --git a/libsharp/ylmgen_c.h b/libsharp/ylmgen_c.h
new file mode 100644
index 0000000..6462478
--- /dev/null
+++ b/libsharp/ylmgen_c.h
@@ -0,0 +1,92 @@
+/*
+ *  This file is part of libsharp.
+ *
+ *  libsharp is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  libsharp is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with libsharp; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
+ *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
+ *  (DLR).
+ */
+
+/*! \file ylmgen_c.h
+ *  Code for efficient calculation of Y_lm(phi=0,theta)
+ *
+ *  Copyright (C) 2005-2012 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#ifndef PLANCK_YLMGEN_C_H
+#define PLANCK_YLMGEN_C_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum { minscale=-8, limscale=-3, maxscale=5 }; /* cf[k-minscale]==fbig^k */
+static const double fbig=0x1p+90,fsmall=0x1p-90; /* 2^90 and 2^-90 */
+/* coefficient records used for Ylmgen_C::rf and Ylmgen_C::fx, respectively */
+typedef struct { double f[2]; } ylmgen_dbl2;
+typedef struct { double f[3]; } ylmgen_dbl3;
+
+typedef struct /* generator state filled in by Ylmgen_init()/Ylmgen_prepare() */
+  {
+/* for public use; immutable during lifetime */
+  int lmax, mmax, s; /* limits for l and m, and the spin */
+  double *cf; /* maxscale-minscale+1 entries; cf[k-minscale]==fbig^k */
+
+/* for public use; will typically change after call to Ylmgen_prepare() */
+  int m; /* m of the last Ylmgen_prepare(); negative until the first call */
+
+/* used if s==0 */
+  double *mfac; /* mmax+1 entries, computed once in Ylmgen_init() */
+  ylmgen_dbl2 *rf; /* lmax+1 entries; l>=m refreshed by Ylmgen_prepare() */
+
+/* used if s!=0 */
+  int sinPow, cosPow, preMinus_p, preMinus_m; /* set by Ylmgen_prepare() */
+  double *prefac; /* mmax+1 entries, computed once in Ylmgen_init() */
+  int *fscale; /* mmax+1 entries: power of fbig that goes with prefac[m] */
+  ylmgen_dbl3 *fx; /* lmax+2 entries, filled by Ylmgen_prepare() */
+
+/* internal usage only */
+/* used if s==0 */
+  double *root, *iroot; /* 2*lmax+5 entries: sqrt(i) and 1/sqrt(i) (0 at i==0) */
+
+/* used if s!=0 */
+  double *flm1, *flm2, *inv; /* sqrt(1/(i+1)), sqrt(i/(i+1)), 1/i (0 at i==0) */
+  int mlo, mhi; /* min and max of (m,s) at the last Ylmgen_prepare() */
+  } Ylmgen_C;
+
+/*! Creates a generator which will calculate helper data for Y_lm calculation
+    up to \a l=l_max and \a m=m_max. \a spin must not be negative. */
+void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin);
+
+/*! Deallocates a generator previously initialised by Ylmgen_init(). */
+void Ylmgen_destroy (Ylmgen_C *gen);
+
+/*! Prepares the object for the calculation at \a m, which must be >= 0. */
+void Ylmgen_prepare (Ylmgen_C *gen, int m);
+
+/*! Returns a pointer to an array with \a lmax+1 entries containing
+    normalisation factors that must be applied to Y_lm values computed for
+    \a spin. The array must be deallocated (using free()) by the user. */
+double *Ylmgen_get_norm (int lmax, int spin);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif