diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..d159169 --- /dev/null +++ b/COPYING @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. 
+ + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. 
(Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. 
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e6c51bc --- /dev/null +++ b/Makefile @@ -0,0 +1,50 @@ +SHARP_TARGET?=auto +ifndef SHARP_TARGET + SHARP_TARGET:=$(error SHARP_TARGET undefined. Please see README.compilation for help)UNDEFINED +endif + +default: compile_all +SRCROOT:=$(shell pwd) +include $(SRCROOT)/config/config.$(SHARP_TARGET) +include $(SRCROOT)/config/rules.common + +all_hdr:= +all_lib:= +all_cbin:= + +FULL_INCLUDE:= + +include c_utils/planck.make +include libfftpack/planck.make +include libsharp/planck.make +include docsrc/planck.make + +$(all_lib): %: | $(LIBDIR)_mkdir + @echo "# creating library $*" + $(ARCREATE) $@ $^ + +$(all_cbin): %: | $(BINDIR)_mkdir + @echo "# linking C binary $*" + $(CL) -o $@ $^ $(CLFLAGS) +# $(CXX) -o $@ $^ $(CLFLAGS) + +compile_all: $(all_cbin) hdrcopy + +autotune: sharp_bench + $(BINDIR)/sharp_bench + mv oracle.inc $(SRCROOT)/libsharp + $(MAKE) + +hdrclean: + @if [ -d $(INCDIR) ]; then rm -rf $(INCDIR)/* ; fi + +hdrcopy: | $(INCDIR)_mkdir + @if [ "$(all_hdr)" ]; then cp -p $(all_hdr) $(INCDIR); fi + +$(notdir $(all_cbin)) : % : $(BINDIR)/% + +test: compile_all + $(BINDIR)/sharp_acctest && \ + $(BINDIR)/sharp_test healpix 2048 1024 1 0 1 && \ + $(BINDIR)/sharp_test ecp 2047 4096 0 2 1 && \ + $(BINDIR)/sharp_test gauss 2047 4096 0 0 2 diff --git a/README.compilation b/README.compilation new file mode 100644 index 0000000..7607750 --- /dev/null +++ b/README.compilation @@ -0,0 +1,16 @@ +GNU make and GNU gcc (version 4.x) are required for compilation. + +Simply run "./configure"; if this fails, please refer to the output of +"./configure --help" for additional hints and, if necessary, provide +additional flags to the configure script. +Once the script finishes successfully, run "make autotune" +(or "gmake autotune"). This should perform some necessary self-tuning and +install the compilation products in the subdirectory "auto/". 
+NOTE: Autotuning should be done on the computer where you wish to use +the library later on, and no other CPU-intensive tasks should be running +during the autotuning process. + +Documentation can be created by the command "(g)make doc". +However this requires the doxygen application to be installed +on your system. +The documentation will be created in the subdirectory doc/. diff --git a/c_utils/c_utils.c b/c_utils/c_utils.c new file mode 100644 index 0000000..d8601e7 --- /dev/null +++ b/c_utils/c_utils.c @@ -0,0 +1,145 @@ +/* + * This file is part of libc_utils. + * + * libc_utils is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libc_utils is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libc_utils; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). 
+ */ + +/* + * Convenience functions + * + * Copyright (C) 2008, 2009, 2010, 2011 Max-Planck-Society + * Author: Martin Reinecke + */ + +#include +#include +#include +#include "c_utils.h" +#include "vec_utils.h" +#ifdef _OPENMP +#include +#endif +#ifdef USE_MPI +#include +#endif + +void util_fail_ (const char *file, int line, const char *func, const char *msg) + { + fprintf(stderr,"%s, %i (%s):\n%s\n",file,line,func,msg); + exit(1); + } +void util_warn_ (const char *file, int line, const char *func, const char *msg) + { + fprintf(stderr,"%s, %i (%s):\n%s\n",file,line,func,msg); + } + +/* This function tries to avoid allocations with a total size close to a high + power of two (called the "critical stride" here), by adding a few more bytes + if necessary. This lowers the probability that two arrays differ by a multiple + of the critical stride in their starting address, which in turn lowers the + risk of cache line contention. */ +static size_t manipsize(size_t sz) + { + const size_t critical_stride=4096, cacheline=64, overhead=32; + if (sz < (critical_stride/2)) return sz; + if (((sz+overhead)%critical_stride)>(2*cacheline)) return sz; + return sz+2*cacheline; + } + +#ifdef __SSE__ +#include +void *util_malloc_ (size_t sz) + { + void *res; + if (sz==0) return NULL; + res = _mm_malloc(manipsize(sz),16); + UTIL_ASSERT(res,"_mm_malloc() failed"); + return res; + } +void util_free_ (void *ptr) + { if ((ptr)!=NULL) _mm_free(ptr); } +#else +void *util_malloc_ (size_t sz) + { + void *res; + if (sz==0) return NULL; + res = malloc(manipsize(sz)); + UTIL_ASSERT(res,"malloc() failed"); + return res; + } +void util_free_ (void *ptr) + { if ((ptr)!=NULL) free(ptr); } +#endif + +static void OpenMP_status(void) + { +#ifndef _OPENMP + printf("OpenMP: not supported by this binary\n"); +#else + int threads = omp_get_max_threads(); + if (threads>1) + printf("OpenMP active: max. 
%d threads.\n",threads); + else + printf("OpenMP active, but running with 1 thread only.\n"); +#endif + } + +static void MPI_status(void) + { +#ifndef USE_MPI + printf("MPI: not supported by this binary\n"); +#else + int tasks; + MPI_Comm_size(MPI_COMM_WORLD,&tasks); + if (tasks>1) + printf("MPI active with %d tasks.\n",tasks); + else + printf("MPI active, but running with 1 task only.\n"); +#endif + } + +static void vecmath_status(void) + { printf("Supported vector length: %d\n",VLEN); } + +void announce_c (const char *name) + { + size_t m, nlen=strlen(name); + printf("\n+-"); + for (m=0; m +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void util_fail_ (const char *file, int line, const char *func, const char *msg); +void util_warn_ (const char *file, int line, const char *func, const char *msg); +void *util_malloc_ (size_t sz); +void util_free_ (void *ptr); + +void announce_c (const char *name); +void module_startup_c (const char *name, int argc, int argc_expected, + const char *argv_expected, int verbose); + +#if defined (__GNUC__) +#define UTIL_FUNC_NAME__ __func__ +#else +#define UTIL_FUNC_NAME__ "unknown" +#endif + +/*! \def UTIL_ASSERT(cond,msg) + If \a cond is false, print an error message containing function name, + source file name and line number of the call, as well as \a msg; + then exit the program with an error status. */ +#define UTIL_ASSERT(cond,msg) \ + if(!(cond)) util_fail_(__FILE__,__LINE__,UTIL_FUNC_NAME__,msg) +/*! \def UTIL_WARN(cond,msg) + If \a cond is false, print an warning containing function name, + source file name and line number of the call, as well as \a msg. */ +#define UTIL_WARN(cond,msg) \ + if(!(cond)) util_warn_(__FILE__,__LINE__,UTIL_FUNC_NAME__,msg) +/*! \def UTIL_FAIL(msg) + Print an error message containing function name, + source file name and line number of the call, as well as \a msg; + then exit the program with an error status. 
*/ +#define UTIL_FAIL(msg) \ + util_fail_(__FILE__,__LINE__,UTIL_FUNC_NAME__,msg) + +/*! \def ALLOC(ptr,type,num) + Allocate space for \a num objects of type \a type. Make sure that the + allocation succeeded, else stop the program with an error. Return the + resulting pointer in \a ptr. */ +#define ALLOC(ptr,type,num) \ + do { (ptr)=(type *)util_malloc_((num)*sizeof(type)); } while (0) +/*! \def RALLOC(type,num) + Allocate space for \a num objects of type \a type. Make sure that the + allocation succeeded, else stop the program with an error. Cast the + resulting pointer to \a (type*). */ +#define RALLOC(type,num) \ + ((type *)util_malloc_((num)*sizeof(type))) +/*! \def DEALLOC(ptr) + Deallocate \a ptr. It must have been allocated using \a ALLOC or + \a RALLOC. */ +#define DEALLOC(ptr) \ + do { util_free_(ptr); (ptr)=NULL; } while(0) +#define RESIZE(ptr,type,num) \ + do { util_free_(ptr); ALLOC(ptr,type,num); } while(0) +#define GROW(ptr,type,sz_old,sz_new) \ + do { \ + if ((sz_new)>(sz_old)) \ + { RESIZE(ptr,type,2*(sz_new));sz_old=2*(sz_new); } \ + } while(0) +/*! \def SET_ARRAY(ptr,i1,i2,val) + Set the entries \a ptr[i1] ... \a ptr[i2-1] to \a val. */ +#define SET_ARRAY(ptr,i1,i2,val) \ + do { \ + ptrdiff_t cnt_; \ + for (cnt_=(i1);cnt_<(i2);++cnt_) (ptr)[cnt_]=(val); \ + } while(0) +/*! \def COPY_ARRAY(src,dest,i1,i2) + Copy the entries \a src[i1] ... \a src[i2-1] to + \a dest[i1] ... \a dest[i2-1]. */ +#define COPY_ARRAY(src,dest,i1,i2) \ + do { \ + ptrdiff_t cnt_; \ + for (cnt_=(i1);cnt_<(i2);++cnt_) (dest)[cnt_]=(src)[cnt_]; \ + } while(0) + +#define ALLOC2D(ptr,type,num1,num2) \ + do { \ + size_t cnt_, num1_=(num1), num2_=(num2); \ + ALLOC(ptr,type *,num1_); \ + ALLOC(ptr[0],type,num1_*num2_); \ + for (cnt_=1; cnt_(b)) ? (a) : (b)) +#define IMIN(a,b) \ + (((a)<(b)) ? 
(a) : (b)) + +#define SWAP(a,b,type) \ + do { type tmp_=(a); (a)=(b); (b)=tmp_; } while(0) + +#define CHECK_STACK_ALIGN(align) \ + do { \ + double foo; \ + UTIL_WARN((((size_t)(&foo))&(align-1))==0, \ + "WARNING: stack not sufficiently aligned!"); \ + } while(0) + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/c_utils/planck.make b/c_utils/planck.make new file mode 100644 index 0000000..f4a2d30 --- /dev/null +++ b/c_utils/planck.make @@ -0,0 +1,18 @@ +PKG:=c_utils + +SD:=$(SRCROOT)/$(PKG) +OD:=$(BLDROOT)/$(PKG) + +FULL_INCLUDE+= -I$(SD) + +HDR_$(PKG):=$(SD)/*.h +LIB_$(PKG):=$(LIBDIR)/libc_utils.a + +OBJ:=c_utils.o walltime_c.o +OBJ:=$(OBJ:%=$(OD)/%) + +$(OBJ): $(HDR_$(PKG)) | $(OD)_mkdir +$(LIB_$(PKG)): $(OBJ) + +all_hdr+=$(HDR_$(PKG)) +all_lib+=$(LIB_$(PKG)) diff --git a/c_utils/vec_utils.h b/c_utils/vec_utils.h new file mode 100644 index 0000000..50066f8 --- /dev/null +++ b/c_utils/vec_utils.h @@ -0,0 +1,43 @@ +/* + * This file is part of libc_utils. + * + * libc_utils is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libc_utils is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libc_utils; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/*! 
\file vec_utils.h + * Functionality related to vector instruction support + * + * Copyright (C) 2012 Max-Planck-Society + * \author Martin Reinecke + */ + +#ifndef PLANCK_VEC_UTILS_H +#define PLANCK_VEC_UTILS_H + +#if (defined (__AVX__) && (!defined (DISABLE_AVX)) && (!defined (DISABLE_SSE2))) +#define VLEN 4 +#elif (defined (__SSE2__) && (!defined (DISABLE_SSE2))) +#define VLEN 2 +#else +#define VLEN 1 +#endif + +#endif diff --git a/c_utils/walltime_c.c b/c_utils/walltime_c.c new file mode 100644 index 0000000..c9dce3a --- /dev/null +++ b/c_utils/walltime_c.c @@ -0,0 +1,54 @@ +/* + * This file is part of libc_utils. + * + * libc_utils is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libc_utils is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libc_utils; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). 
+ */ + +/* + * Functionality for reading wall clock time + * + * Copyright (C) 2010, 2011 Max-Planck-Society + * Author: Martin Reinecke + */ + +#if defined (_OPENMP) +#include +#elif defined (USE_MPI) +#include "mpi.h" +#else +#include +#include +#endif + +#include "walltime_c.h" + +double wallTime(void) + { +#if defined (_OPENMP) + return omp_get_wtime(); +#elif defined (USE_MPI) + return MPI_Wtime(); +#else + struct timeval t; + gettimeofday(&t, NULL); + return t.tv_sec + 1e-6*t.tv_usec; +#endif + } diff --git a/c_utils/walltime_c.h b/c_utils/walltime_c.h new file mode 100644 index 0000000..ea9d2a2 --- /dev/null +++ b/c_utils/walltime_c.h @@ -0,0 +1,53 @@ +/* + * This file is part of libc_utils. + * + * libc_utils is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libc_utils is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libc_utils; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libc_utils is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/*! \file walltime_c.h + * Functionality for reading wall clock time + * + * Copyright (C) 2010 Max-Planck-Society + * \author Martin Reinecke + */ + +#ifndef PLANCK_WALLTIME_C_H +#define PLANCK_WALLTIME_C_H + +#ifdef __cplusplus +extern "C" { +#endif + +/*! Returns an approximation of the current wall time (in seconds). 
+ The first available of the following timers will be used: +
    +
  - \a omp_get_wtime(), if OpenMP is available +
  - \a MPI_Wtime(), if MPI is available +
  - \a gettimeofday() otherwise +
+ \note Only useful for measuring time differences. */ +double wallTime(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/config/config.auto.in b/config/config.auto.in new file mode 100644 index 0000000..32b340b --- /dev/null +++ b/config/config.auto.in @@ -0,0 +1,9 @@ +@SILENT_RULE@ + +CC=@CC@ +CL=@CC@ +CCFLAGS_NO_C=@CCFLAGS_NO_C@ +CCFLAGS=$(CCFLAGS_NO_C) -c +CLFLAGS=-L. -L$(LIBDIR) @LDCCFLAGS@ -lm + +ARCREATE=@ARCREATE@ diff --git a/config/rules.common b/config/rules.common new file mode 100644 index 0000000..419584d --- /dev/null +++ b/config/rules.common @@ -0,0 +1,31 @@ +BLDROOT = $(SRCROOT)/build.$(SHARP_TARGET) +PREFIX = $(SRCROOT)/$(SHARP_TARGET) +BINDIR = $(PREFIX)/bin +INCDIR = $(PREFIX)/include +LIBDIR = $(PREFIX)/lib +DOCDIR = $(SRCROOT)/doc + +# do not use any suffix rules +.SUFFIXES: +# do not use any default rules +.DEFAULT: + +echo_config: + @echo using configuration \'$(SHARP_TARGET)\' + +$(BLDROOT)/%.o : $(SRCROOT)/%.c | echo_config + @echo "# compiling $*.c" + cd $(@D) && $(CC) $(FULL_INCLUDE) -I$(BLDROOT) $(CCFLAGS) $< + +$(BLDROOT)/%.o : $(SRCROOT)/%.cc | echo_config + @echo "# compiling $*.cc" + cd $(@D) && $(CXX) $(FULL_INCLUDE) -I$(BLDROOT) $(CXXCFLAGS) $< + +%_mkdir: + @if [ ! -d $* ]; then mkdir -p $* ; fi + +clean: + rm -rf $(BLDROOT) $(PREFIX) $(DOCDIR) autom4te.cache/ config.log config.status + +distclean: clean + rm -f config/config.auto diff --git a/configure.ac b/configure.ac new file mode 100644 index 0000000..cb3ddc6 --- /dev/null +++ b/configure.ac @@ -0,0 +1,123 @@ +AC_INIT(config/config.auto.in) + +AC_CHECK_PROG([uname_found],[uname],[1],[0]) +if test $uname_found -eq 0 ; then + echo "No uname found; setting system type to unknown." 
+ system="unknown" +else + system=`uname -s`-`uname -r` +fi +AC_LANG([C]) + +AC_TRY_COMPILE([], [@%:@ifndef __INTEL_COMPILER +choke me +@%:@endif], [ICC=[yes]], [ICC=[no]]) + +if test $ICC = yes; then GCC=no; fi +CCTYPE=unknown +if test $GCC = yes; then CCTYPE=gcc; fi +if test $ICC = yes; then CCTYPE=icc; fi +AC_OPENMP + +SILENT_RULE=".SILENT:" +AC_ARG_ENABLE(noisy-make, + [ --enable-noisy-make enable detailed make output], + [if test "$enableval" = yes; then + SILENT_RULE="" + fi]) + +ENABLE_MPI=no +AC_ARG_ENABLE(mpi, + [ --enable-mpi enable generation of MPI-parallel code], + [if test "$enableval" = yes; then + ENABLE_MPI=yes + fi]) + +ENABLE_DEBUG=no +AC_ARG_ENABLE(debug, + [ --enable-debug enable generation of debugging symbols], + [if test "$enableval" = yes; then + ENABLE_DEBUG=yes + fi]) + +ENABLE_SSE2=yes +AC_ARG_ENABLE(sse2, + [ --disable-sse2 disable generation of SSE2 instructions], + [if test "$enableval" = no; then + ENABLE_SSE2=no + fi]) + +ENABLE_AVX=yes +AC_ARG_ENABLE(avx, + [ --disable-avx disable generation of AVX instructions], + [if test "$enableval" = no; then + ENABLE_AVX=no + fi]) + +case $CCTYPE in + gcc) + CCFLAGS="-O3 -fno-tree-vectorize -ffast-math -fomit-frame-pointer -std=c99 -pedantic -Wextra -Wall -Wno-unknown-pragmas -Wshadow -Wmissing-prototypes -Wfatal-errors" + GCCVERSION="`$CC -dumpversion 2>&1`" + echo "Using gcc version $GCCVERSION" + AC_SUBST(GCCVERSION) + case $system in + Darwin-*) + ;; + *) + CCFLAGS="$CCFLAGS -ffunction-sections -fdata-sections" + ;; + esac + changequote(,) + gcc43=`echo $GCCVERSION | grep -c '4\.[3456789]'` + changequote([,]) + if test $gcc43 -gt 0; then + CCFLAGS="$CCFLAGS -march=native" + fi + ;; + icc) + CCFLAGS="-O3 -xHOST -std=c99 -ip -Wbrief -Wall -vec-report0 -openmp-report0 -wd383,981,1419,1572" + ;; + *) + CCFLAGS="-O2" + # Don't do anything now + ;; +esac + +case $system in + Darwin-*) + ARCREATE="libtool -static -o" + ;; + *) + ARCREATE="ar cr" + ;; +esac + +CCFLAGS="$CCFLAGS $OPENMP_CFLAGS" + 
+if test $ENABLE_DEBUG = yes; then + CCFLAGS="$CCFLAGS -g" +fi + +if test $ENABLE_MPI = yes; then + CCFLAGS="$CCFLAGS -DUSE_MPI" +fi + +if test $ENABLE_SSE2 = no; then + CCFLAGS="$CCFLAGS -DDISABLE_SSE2" +fi + +if test $ENABLE_AVX = no; then + CCFLAGS="$CCFLAGS -DDISABLE_AVX" +fi + +CCFLAGS_NO_C="$CCFLAGS $CPPFLAGS" + +LDCCFLAGS="$LDFLAGS $CCFLAGS" + +AC_SUBST(SILENT_RULE) +AC_SUBST(CC) +AC_SUBST(CCFLAGS_NO_C) +AC_SUBST(LDCCFLAGS) +AC_SUBST(ARCREATE) + +AC_OUTPUT(config/config.auto) diff --git a/docsrc/c_utils.dox b/docsrc/c_utils.dox new file mode 100644 index 0000000..daf432f --- /dev/null +++ b/docsrc/c_utils.dox @@ -0,0 +1,290 @@ +# Doxyfile 1.8.1 + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- +DOXYFILE_ENCODING = UTF-8 +PROJECT_NAME = "LevelS C support library" +PROJECT_NUMBER = 0.1 +PROJECT_BRIEF = +PROJECT_LOGO = +OUTPUT_DIRECTORY = . 
+CREATE_SUBDIRS = NO +OUTPUT_LANGUAGE = English +BRIEF_MEMBER_DESC = NO +REPEAT_BRIEF = YES +ABBREVIATE_BRIEF = +ALWAYS_DETAILED_SEC = NO +INLINE_INHERITED_MEMB = NO +FULL_PATH_NAMES = NO +STRIP_FROM_PATH = +STRIP_FROM_INC_PATH = +SHORT_NAMES = NO +JAVADOC_AUTOBRIEF = NO +QT_AUTOBRIEF = NO +MULTILINE_CPP_IS_BRIEF = NO +INHERIT_DOCS = YES +SEPARATE_MEMBER_PAGES = NO +TAB_SIZE = 8 +ALIASES = +TCL_SUBST = +OPTIMIZE_OUTPUT_FOR_C = YES +OPTIMIZE_OUTPUT_JAVA = NO +OPTIMIZE_FOR_FORTRAN = NO +OPTIMIZE_OUTPUT_VHDL = NO +EXTENSION_MAPPING = +MARKDOWN_SUPPORT = YES +BUILTIN_STL_SUPPORT = NO +CPP_CLI_SUPPORT = NO +SIP_SUPPORT = NO +IDL_PROPERTY_SUPPORT = YES +DISTRIBUTE_GROUP_DOC = NO +SUBGROUPING = YES +INLINE_GROUPED_CLASSES = NO +INLINE_SIMPLE_STRUCTS = NO +TYPEDEF_HIDES_STRUCT = NO +SYMBOL_CACHE_SIZE = 0 +LOOKUP_CACHE_SIZE = 0 +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- +EXTRACT_ALL = NO +EXTRACT_PRIVATE = NO +EXTRACT_PACKAGE = NO +EXTRACT_STATIC = NO +EXTRACT_LOCAL_CLASSES = YES +EXTRACT_LOCAL_METHODS = NO +EXTRACT_ANON_NSPACES = NO +HIDE_UNDOC_MEMBERS = YES +HIDE_UNDOC_CLASSES = YES +HIDE_FRIEND_COMPOUNDS = YES +HIDE_IN_BODY_DOCS = NO +INTERNAL_DOCS = NO +CASE_SENSE_NAMES = YES +HIDE_SCOPE_NAMES = NO +SHOW_INCLUDE_FILES = YES +FORCE_LOCAL_INCLUDES = NO +INLINE_INFO = YES +SORT_MEMBER_DOCS = NO +SORT_BRIEF_DOCS = NO +SORT_MEMBERS_CTORS_1ST = NO +SORT_GROUP_NAMES = NO +SORT_BY_SCOPE_NAME = NO +STRICT_PROTO_MATCHING = NO +GENERATE_TODOLIST = YES +GENERATE_TESTLIST = YES +GENERATE_BUGLIST = YES +GENERATE_DEPRECATEDLIST= YES +ENABLED_SECTIONS = +MAX_INITIALIZER_LINES = 30 +SHOW_USED_FILES = YES +SHOW_FILES = YES +SHOW_NAMESPACES = YES +FILE_VERSION_FILTER = +LAYOUT_FILE = +CITE_BIB_FILES = +#--------------------------------------------------------------------------- +# configuration options related to warning and progress 
messages +#--------------------------------------------------------------------------- +QUIET = YES +WARNINGS = YES +WARN_IF_UNDOCUMENTED = YES +WARN_IF_DOC_ERROR = YES +WARN_NO_PARAMDOC = NO +WARN_FORMAT = "$file:$line: $text" +WARN_LOGFILE = +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- +INPUT = ../c_utils +INPUT_ENCODING = UTF-8 +FILE_PATTERNS = *.h \ + *.c \ + *.dox +RECURSIVE = YES +EXCLUDE = +EXCLUDE_SYMLINKS = NO +EXCLUDE_PATTERNS = +EXCLUDE_SYMBOLS = +EXAMPLE_PATH = +EXAMPLE_PATTERNS = +EXAMPLE_RECURSIVE = NO +IMAGE_PATH = +INPUT_FILTER = +FILTER_PATTERNS = +FILTER_SOURCE_FILES = NO +FILTER_SOURCE_PATTERNS = +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- +SOURCE_BROWSER = YES +INLINE_SOURCES = NO +STRIP_CODE_COMMENTS = NO +REFERENCED_BY_RELATION = NO +REFERENCES_RELATION = NO +REFERENCES_LINK_SOURCE = YES +USE_HTAGS = NO +VERBATIM_HEADERS = YES +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- +ALPHABETICAL_INDEX = YES +COLS_IN_ALPHA_INDEX = 5 +IGNORE_PREFIX = +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- +GENERATE_HTML = YES +HTML_OUTPUT = htmldoc +HTML_FILE_EXTENSION = .html +HTML_HEADER = +HTML_FOOTER = footer.html +HTML_STYLESHEET = +HTML_EXTRA_FILES = +HTML_COLORSTYLE_HUE = 220 +HTML_COLORSTYLE_SAT = 100 +HTML_COLORSTYLE_GAMMA = 80 +HTML_TIMESTAMP = YES +HTML_DYNAMIC_SECTIONS = NO +HTML_INDEX_NUM_ENTRIES = 
100 +GENERATE_DOCSET = NO +DOCSET_FEEDNAME = "Doxygen generated docs" +DOCSET_BUNDLE_ID = org.doxygen.Project +DOCSET_PUBLISHER_ID = org.doxygen.Publisher +DOCSET_PUBLISHER_NAME = Publisher +GENERATE_HTMLHELP = NO +CHM_FILE = +HHC_LOCATION = +GENERATE_CHI = NO +CHM_INDEX_ENCODING = +BINARY_TOC = NO +TOC_EXPAND = NO +GENERATE_QHP = NO +QCH_FILE = +QHP_NAMESPACE = org.doxygen.Project +QHP_VIRTUAL_FOLDER = doc +QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = +QHG_LOCATION = +GENERATE_ECLIPSEHELP = NO +ECLIPSE_DOC_ID = org.doxygen.Project +DISABLE_INDEX = NO +GENERATE_TREEVIEW = NO +ENUM_VALUES_PER_LINE = 4 +TREEVIEW_WIDTH = 250 +EXT_LINKS_IN_WINDOW = NO +FORMULA_FONTSIZE = 10 +FORMULA_TRANSPARENT = YES +USE_MATHJAX = NO +MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest +MATHJAX_EXTENSIONS = +SEARCHENGINE = NO +SERVER_BASED_SEARCH = NO +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- +GENERATE_LATEX = NO +LATEX_OUTPUT = latex +LATEX_CMD_NAME = latex +MAKEINDEX_CMD_NAME = makeindex +COMPACT_LATEX = YES +PAPER_TYPE = a4wide +EXTRA_PACKAGES = +LATEX_HEADER = +LATEX_FOOTER = +PDF_HYPERLINKS = YES +USE_PDFLATEX = YES +LATEX_BATCHMODE = NO +LATEX_HIDE_INDICES = NO +LATEX_SOURCE_CODE = NO +LATEX_BIB_STYLE = plain +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- +GENERATE_RTF = NO +RTF_OUTPUT = rtf +COMPACT_RTF = NO +RTF_HYPERLINKS = NO +RTF_STYLESHEET_FILE = +RTF_EXTENSIONS_FILE = +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- +GENERATE_MAN = NO +MAN_OUTPUT = man 
+MAN_EXTENSION = .3 +MAN_LINKS = NO +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- +GENERATE_XML = NO +XML_OUTPUT = xml +XML_SCHEMA = +XML_DTD = +XML_PROGRAMLISTING = YES +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- +GENERATE_AUTOGEN_DEF = NO +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- +GENERATE_PERLMOD = NO +PERLMOD_LATEX = NO +PERLMOD_PRETTY = YES +PERLMOD_MAKEVAR_PREFIX = +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- +ENABLE_PREPROCESSING = YES +MACRO_EXPANSION = NO +EXPAND_ONLY_PREDEF = NO +SEARCH_INCLUDES = YES +INCLUDE_PATH = +INCLUDE_FILE_PATTERNS = +PREDEFINED = +EXPAND_AS_DEFINED = +SKIP_FUNCTION_MACROS = YES +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- +TAGFILES = +GENERATE_TAGFILE = c_utils.tag +ALLEXTERNALS = NO +EXTERNAL_GROUPS = YES +PERL_PATH = /usr/bin/perl +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- +CLASS_DIAGRAMS = YES +MSCGEN_PATH = +HIDE_UNDOC_RELATIONS = YES +HAVE_DOT = NO +DOT_NUM_THREADS = 0 +DOT_FONTNAME = FreeSans +DOT_FONTSIZE = 10 +DOT_FONTPATH = +CLASS_GRAPH = 
YES +COLLABORATION_GRAPH = YES +GROUP_GRAPHS = YES +UML_LOOK = NO +UML_LIMIT_NUM_FIELDS = 10 +TEMPLATE_RELATIONS = YES +INCLUDE_GRAPH = NO +INCLUDED_BY_GRAPH = NO +CALL_GRAPH = NO +CALLER_GRAPH = NO +GRAPHICAL_HIERARCHY = NO +DIRECTORY_GRAPH = YES +DOT_IMAGE_FORMAT = png +INTERACTIVE_SVG = NO +DOT_PATH = +DOTFILE_DIRS = +MSCFILE_DIRS = +DOT_GRAPH_MAX_NODES = 50 +MAX_DOT_GRAPH_DEPTH = 0 +DOT_TRANSPARENT = NO +DOT_MULTI_TARGETS = NO +GENERATE_LEGEND = YES +DOT_CLEANUP = YES diff --git a/docsrc/footer.html b/docsrc/footer.html new file mode 100644 index 0000000..6f5dbf0 --- /dev/null +++ b/docsrc/footer.html @@ -0,0 +1,5 @@ +
+Generated on $datetime for $projectname +
+ + diff --git a/docsrc/index_code.html b/docsrc/index_code.html new file mode 100644 index 0000000..d8a001d --- /dev/null +++ b/docsrc/index_code.html @@ -0,0 +1,15 @@ + + +Libsharp source code documentation + +

<h1>Libsharp source code documentation</h1>

+ +

<h2>C interfaces</h2>

+ + + + diff --git a/docsrc/libfftpack.dox b/docsrc/libfftpack.dox new file mode 100644 index 0000000..7ff2c23 --- /dev/null +++ b/docsrc/libfftpack.dox @@ -0,0 +1,290 @@ +# Doxyfile 1.8.1 + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- +DOXYFILE_ENCODING = UTF-8 +PROJECT_NAME = "LevelS FFT library" +PROJECT_NUMBER = 0.1 +PROJECT_BRIEF = +PROJECT_LOGO = +OUTPUT_DIRECTORY = . +CREATE_SUBDIRS = NO +OUTPUT_LANGUAGE = English +BRIEF_MEMBER_DESC = NO +REPEAT_BRIEF = YES +ABBREVIATE_BRIEF = +ALWAYS_DETAILED_SEC = NO +INLINE_INHERITED_MEMB = NO +FULL_PATH_NAMES = NO +STRIP_FROM_PATH = +STRIP_FROM_INC_PATH = +SHORT_NAMES = NO +JAVADOC_AUTOBRIEF = NO +QT_AUTOBRIEF = NO +MULTILINE_CPP_IS_BRIEF = NO +INHERIT_DOCS = YES +SEPARATE_MEMBER_PAGES = NO +TAB_SIZE = 8 +ALIASES = +TCL_SUBST = +OPTIMIZE_OUTPUT_FOR_C = YES +OPTIMIZE_OUTPUT_JAVA = NO +OPTIMIZE_FOR_FORTRAN = NO +OPTIMIZE_OUTPUT_VHDL = NO +EXTENSION_MAPPING = +MARKDOWN_SUPPORT = YES +BUILTIN_STL_SUPPORT = NO +CPP_CLI_SUPPORT = NO +SIP_SUPPORT = NO +IDL_PROPERTY_SUPPORT = YES +DISTRIBUTE_GROUP_DOC = NO +SUBGROUPING = YES +INLINE_GROUPED_CLASSES = NO +INLINE_SIMPLE_STRUCTS = NO +TYPEDEF_HIDES_STRUCT = NO +SYMBOL_CACHE_SIZE = 0 +LOOKUP_CACHE_SIZE = 0 +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- +EXTRACT_ALL = NO +EXTRACT_PRIVATE = NO +EXTRACT_PACKAGE = NO +EXTRACT_STATIC = NO +EXTRACT_LOCAL_CLASSES = YES +EXTRACT_LOCAL_METHODS = NO +EXTRACT_ANON_NSPACES = NO +HIDE_UNDOC_MEMBERS = YES +HIDE_UNDOC_CLASSES = YES +HIDE_FRIEND_COMPOUNDS = YES +HIDE_IN_BODY_DOCS = NO +INTERNAL_DOCS = NO +CASE_SENSE_NAMES = YES +HIDE_SCOPE_NAMES = NO +SHOW_INCLUDE_FILES = YES +FORCE_LOCAL_INCLUDES = NO +INLINE_INFO = YES +SORT_MEMBER_DOCS = NO 
+SORT_BRIEF_DOCS = NO +SORT_MEMBERS_CTORS_1ST = NO +SORT_GROUP_NAMES = NO +SORT_BY_SCOPE_NAME = NO +STRICT_PROTO_MATCHING = NO +GENERATE_TODOLIST = YES +GENERATE_TESTLIST = YES +GENERATE_BUGLIST = YES +GENERATE_DEPRECATEDLIST= YES +ENABLED_SECTIONS = +MAX_INITIALIZER_LINES = 30 +SHOW_USED_FILES = YES +SHOW_FILES = YES +SHOW_NAMESPACES = YES +FILE_VERSION_FILTER = +LAYOUT_FILE = +CITE_BIB_FILES = +#--------------------------------------------------------------------------- +# configuration options related to warning and progress messages +#--------------------------------------------------------------------------- +QUIET = YES +WARNINGS = YES +WARN_IF_UNDOCUMENTED = YES +WARN_IF_DOC_ERROR = YES +WARN_NO_PARAMDOC = NO +WARN_FORMAT = "$file:$line: $text" +WARN_LOGFILE = +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- +INPUT = ../libfftpack +INPUT_ENCODING = UTF-8 +FILE_PATTERNS = *.h \ + *.c \ + *.dox +RECURSIVE = YES +EXCLUDE = +EXCLUDE_SYMLINKS = NO +EXCLUDE_PATTERNS = +EXCLUDE_SYMBOLS = +EXAMPLE_PATH = +EXAMPLE_PATTERNS = +EXAMPLE_RECURSIVE = NO +IMAGE_PATH = +INPUT_FILTER = +FILTER_PATTERNS = +FILTER_SOURCE_FILES = NO +FILTER_SOURCE_PATTERNS = +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- +SOURCE_BROWSER = YES +INLINE_SOURCES = NO +STRIP_CODE_COMMENTS = NO +REFERENCED_BY_RELATION = NO +REFERENCES_RELATION = NO +REFERENCES_LINK_SOURCE = YES +USE_HTAGS = NO +VERBATIM_HEADERS = YES +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- +ALPHABETICAL_INDEX = YES +COLS_IN_ALPHA_INDEX = 5 
+IGNORE_PREFIX = +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- +GENERATE_HTML = YES +HTML_OUTPUT = htmldoc +HTML_FILE_EXTENSION = .html +HTML_HEADER = +HTML_FOOTER = footer.html +HTML_STYLESHEET = +HTML_EXTRA_FILES = +HTML_COLORSTYLE_HUE = 220 +HTML_COLORSTYLE_SAT = 100 +HTML_COLORSTYLE_GAMMA = 80 +HTML_TIMESTAMP = YES +HTML_DYNAMIC_SECTIONS = NO +HTML_INDEX_NUM_ENTRIES = 100 +GENERATE_DOCSET = NO +DOCSET_FEEDNAME = "Doxygen generated docs" +DOCSET_BUNDLE_ID = org.doxygen.Project +DOCSET_PUBLISHER_ID = org.doxygen.Publisher +DOCSET_PUBLISHER_NAME = Publisher +GENERATE_HTMLHELP = NO +CHM_FILE = +HHC_LOCATION = +GENERATE_CHI = NO +CHM_INDEX_ENCODING = +BINARY_TOC = NO +TOC_EXPAND = NO +GENERATE_QHP = NO +QCH_FILE = +QHP_NAMESPACE = org.doxygen.Project +QHP_VIRTUAL_FOLDER = doc +QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = +QHG_LOCATION = +GENERATE_ECLIPSEHELP = NO +ECLIPSE_DOC_ID = org.doxygen.Project +DISABLE_INDEX = NO +GENERATE_TREEVIEW = NO +ENUM_VALUES_PER_LINE = 4 +TREEVIEW_WIDTH = 250 +EXT_LINKS_IN_WINDOW = NO +FORMULA_FONTSIZE = 10 +FORMULA_TRANSPARENT = YES +USE_MATHJAX = NO +MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest +MATHJAX_EXTENSIONS = +SEARCHENGINE = NO +SERVER_BASED_SEARCH = NO +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- +GENERATE_LATEX = NO +LATEX_OUTPUT = latex +LATEX_CMD_NAME = latex +MAKEINDEX_CMD_NAME = makeindex +COMPACT_LATEX = YES +PAPER_TYPE = a4wide +EXTRA_PACKAGES = +LATEX_HEADER = +LATEX_FOOTER = +PDF_HYPERLINKS = YES +USE_PDFLATEX = YES +LATEX_BATCHMODE = NO +LATEX_HIDE_INDICES = NO +LATEX_SOURCE_CODE = NO +LATEX_BIB_STYLE = plain 
+#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- +GENERATE_RTF = NO +RTF_OUTPUT = rtf +COMPACT_RTF = NO +RTF_HYPERLINKS = NO +RTF_STYLESHEET_FILE = +RTF_EXTENSIONS_FILE = +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- +GENERATE_MAN = NO +MAN_OUTPUT = man +MAN_EXTENSION = .3 +MAN_LINKS = NO +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- +GENERATE_XML = NO +XML_OUTPUT = xml +XML_SCHEMA = +XML_DTD = +XML_PROGRAMLISTING = YES +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- +GENERATE_AUTOGEN_DEF = NO +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- +GENERATE_PERLMOD = NO +PERLMOD_LATEX = NO +PERLMOD_PRETTY = YES +PERLMOD_MAKEVAR_PREFIX = +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- +ENABLE_PREPROCESSING = YES +MACRO_EXPANSION = NO +EXPAND_ONLY_PREDEF = NO +SEARCH_INCLUDES = YES +INCLUDE_PATH = +INCLUDE_FILE_PATTERNS = +PREDEFINED = +EXPAND_AS_DEFINED = +SKIP_FUNCTION_MACROS = YES +#--------------------------------------------------------------------------- +# Configuration::additions related to external 
references +#--------------------------------------------------------------------------- +TAGFILES = c_utils.tag=../c_utils +GENERATE_TAGFILE = libfftpack.tag +ALLEXTERNALS = NO +EXTERNAL_GROUPS = YES +PERL_PATH = /usr/bin/perl +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- +CLASS_DIAGRAMS = YES +MSCGEN_PATH = +HIDE_UNDOC_RELATIONS = YES +HAVE_DOT = NO +DOT_NUM_THREADS = 0 +DOT_FONTNAME = FreeSans +DOT_FONTSIZE = 10 +DOT_FONTPATH = +CLASS_GRAPH = YES +COLLABORATION_GRAPH = YES +GROUP_GRAPHS = YES +UML_LOOK = NO +UML_LIMIT_NUM_FIELDS = 10 +TEMPLATE_RELATIONS = YES +INCLUDE_GRAPH = NO +INCLUDED_BY_GRAPH = NO +CALL_GRAPH = NO +CALLER_GRAPH = NO +GRAPHICAL_HIERARCHY = NO +DIRECTORY_GRAPH = YES +DOT_IMAGE_FORMAT = png +INTERACTIVE_SVG = NO +DOT_PATH = +DOTFILE_DIRS = +MSCFILE_DIRS = +DOT_GRAPH_MAX_NODES = 50 +MAX_DOT_GRAPH_DEPTH = 0 +DOT_TRANSPARENT = NO +DOT_MULTI_TARGETS = NO +GENERATE_LEGEND = YES +DOT_CLEANUP = YES diff --git a/docsrc/libsharp.dox b/docsrc/libsharp.dox new file mode 100644 index 0000000..b476ab4 --- /dev/null +++ b/docsrc/libsharp.dox @@ -0,0 +1,291 @@ +# Doxyfile 1.8.1 + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- +DOXYFILE_ENCODING = UTF-8 +PROJECT_NAME = "LevelS SHT library" +PROJECT_NUMBER = 0.1 +PROJECT_BRIEF = +PROJECT_LOGO = +OUTPUT_DIRECTORY = . 
+CREATE_SUBDIRS = NO +OUTPUT_LANGUAGE = English +BRIEF_MEMBER_DESC = NO +REPEAT_BRIEF = YES +ABBREVIATE_BRIEF = +ALWAYS_DETAILED_SEC = NO +INLINE_INHERITED_MEMB = NO +FULL_PATH_NAMES = NO +STRIP_FROM_PATH = +STRIP_FROM_INC_PATH = +SHORT_NAMES = NO +JAVADOC_AUTOBRIEF = NO +QT_AUTOBRIEF = NO +MULTILINE_CPP_IS_BRIEF = NO +INHERIT_DOCS = YES +SEPARATE_MEMBER_PAGES = NO +TAB_SIZE = 8 +ALIASES = +TCL_SUBST = +OPTIMIZE_OUTPUT_FOR_C = YES +OPTIMIZE_OUTPUT_JAVA = NO +OPTIMIZE_FOR_FORTRAN = NO +OPTIMIZE_OUTPUT_VHDL = NO +EXTENSION_MAPPING = +MARKDOWN_SUPPORT = YES +BUILTIN_STL_SUPPORT = NO +CPP_CLI_SUPPORT = NO +SIP_SUPPORT = NO +IDL_PROPERTY_SUPPORT = YES +DISTRIBUTE_GROUP_DOC = NO +SUBGROUPING = YES +INLINE_GROUPED_CLASSES = NO +INLINE_SIMPLE_STRUCTS = NO +TYPEDEF_HIDES_STRUCT = NO +SYMBOL_CACHE_SIZE = 0 +LOOKUP_CACHE_SIZE = 0 +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- +EXTRACT_ALL = NO +EXTRACT_PRIVATE = NO +EXTRACT_PACKAGE = NO +EXTRACT_STATIC = NO +EXTRACT_LOCAL_CLASSES = YES +EXTRACT_LOCAL_METHODS = NO +EXTRACT_ANON_NSPACES = NO +HIDE_UNDOC_MEMBERS = YES +HIDE_UNDOC_CLASSES = YES +HIDE_FRIEND_COMPOUNDS = YES +HIDE_IN_BODY_DOCS = NO +INTERNAL_DOCS = NO +CASE_SENSE_NAMES = YES +HIDE_SCOPE_NAMES = NO +SHOW_INCLUDE_FILES = YES +FORCE_LOCAL_INCLUDES = NO +INLINE_INFO = YES +SORT_MEMBER_DOCS = NO +SORT_BRIEF_DOCS = NO +SORT_MEMBERS_CTORS_1ST = NO +SORT_GROUP_NAMES = NO +SORT_BY_SCOPE_NAME = NO +STRICT_PROTO_MATCHING = NO +GENERATE_TODOLIST = YES +GENERATE_TESTLIST = YES +GENERATE_BUGLIST = YES +GENERATE_DEPRECATEDLIST= YES +ENABLED_SECTIONS = +MAX_INITIALIZER_LINES = 30 +SHOW_USED_FILES = YES +SHOW_FILES = YES +SHOW_NAMESPACES = YES +FILE_VERSION_FILTER = +LAYOUT_FILE = +CITE_BIB_FILES = +#--------------------------------------------------------------------------- +# configuration options related to warning and progress 
messages +#--------------------------------------------------------------------------- +QUIET = YES +WARNINGS = YES +WARN_IF_UNDOCUMENTED = YES +WARN_IF_DOC_ERROR = YES +WARN_NO_PARAMDOC = NO +WARN_FORMAT = "$file:$line: $text" +WARN_LOGFILE = +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- +INPUT = ../libsharp +INPUT_ENCODING = UTF-8 +FILE_PATTERNS = *.h \ + *.c \ + *.dox +RECURSIVE = YES +EXCLUDE = +EXCLUDE_SYMLINKS = NO +EXCLUDE_PATTERNS = +EXCLUDE_SYMBOLS = +EXAMPLE_PATH = +EXAMPLE_PATTERNS = +EXAMPLE_RECURSIVE = NO +IMAGE_PATH = +INPUT_FILTER = +FILTER_PATTERNS = +FILTER_SOURCE_FILES = NO +FILTER_SOURCE_PATTERNS = +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- +SOURCE_BROWSER = YES +INLINE_SOURCES = NO +STRIP_CODE_COMMENTS = NO +REFERENCED_BY_RELATION = NO +REFERENCES_RELATION = NO +REFERENCES_LINK_SOURCE = YES +USE_HTAGS = NO +VERBATIM_HEADERS = YES +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- +ALPHABETICAL_INDEX = YES +COLS_IN_ALPHA_INDEX = 5 +IGNORE_PREFIX = +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- +GENERATE_HTML = YES +HTML_OUTPUT = htmldoc +HTML_FILE_EXTENSION = .html +HTML_HEADER = +HTML_FOOTER = footer.html +HTML_STYLESHEET = +HTML_EXTRA_FILES = +HTML_COLORSTYLE_HUE = 220 +HTML_COLORSTYLE_SAT = 100 +HTML_COLORSTYLE_GAMMA = 80 +HTML_TIMESTAMP = YES +HTML_DYNAMIC_SECTIONS = NO +HTML_INDEX_NUM_ENTRIES = 
100 +GENERATE_DOCSET = NO +DOCSET_FEEDNAME = "Doxygen generated docs" +DOCSET_BUNDLE_ID = org.doxygen.Project +DOCSET_PUBLISHER_ID = org.doxygen.Publisher +DOCSET_PUBLISHER_NAME = Publisher +GENERATE_HTMLHELP = NO +CHM_FILE = +HHC_LOCATION = +GENERATE_CHI = NO +CHM_INDEX_ENCODING = +BINARY_TOC = NO +TOC_EXPAND = NO +GENERATE_QHP = NO +QCH_FILE = +QHP_NAMESPACE = org.doxygen.Project +QHP_VIRTUAL_FOLDER = doc +QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = +QHG_LOCATION = +GENERATE_ECLIPSEHELP = NO +ECLIPSE_DOC_ID = org.doxygen.Project +DISABLE_INDEX = NO +GENERATE_TREEVIEW = NO +ENUM_VALUES_PER_LINE = 4 +TREEVIEW_WIDTH = 250 +EXT_LINKS_IN_WINDOW = NO +FORMULA_FONTSIZE = 10 +FORMULA_TRANSPARENT = YES +USE_MATHJAX = NO +MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest +MATHJAX_EXTENSIONS = +SEARCHENGINE = NO +SERVER_BASED_SEARCH = NO +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- +GENERATE_LATEX = NO +LATEX_OUTPUT = latex +LATEX_CMD_NAME = latex +MAKEINDEX_CMD_NAME = makeindex +COMPACT_LATEX = YES +PAPER_TYPE = a4wide +EXTRA_PACKAGES = +LATEX_HEADER = +LATEX_FOOTER = +PDF_HYPERLINKS = YES +USE_PDFLATEX = YES +LATEX_BATCHMODE = NO +LATEX_HIDE_INDICES = NO +LATEX_SOURCE_CODE = NO +LATEX_BIB_STYLE = plain +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- +GENERATE_RTF = NO +RTF_OUTPUT = rtf +COMPACT_RTF = NO +RTF_HYPERLINKS = NO +RTF_STYLESHEET_FILE = +RTF_EXTENSIONS_FILE = +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- +GENERATE_MAN = NO +MAN_OUTPUT = man 
+MAN_EXTENSION = .3 +MAN_LINKS = NO +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- +GENERATE_XML = NO +XML_OUTPUT = xml +XML_SCHEMA = +XML_DTD = +XML_PROGRAMLISTING = YES +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- +GENERATE_AUTOGEN_DEF = NO +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- +GENERATE_PERLMOD = NO +PERLMOD_LATEX = NO +PERLMOD_PRETTY = YES +PERLMOD_MAKEVAR_PREFIX = +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- +ENABLE_PREPROCESSING = YES +MACRO_EXPANSION = NO +EXPAND_ONLY_PREDEF = NO +SEARCH_INCLUDES = YES +INCLUDE_PATH = +INCLUDE_FILE_PATTERNS = +PREDEFINED = +EXPAND_AS_DEFINED = +SKIP_FUNCTION_MACROS = YES +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- +TAGFILES = libfftpack.tag=../libfftpack \ + c_utils.tag=../c_utils +GENERATE_TAGFILE = libsharp.tag +ALLEXTERNALS = NO +EXTERNAL_GROUPS = YES +PERL_PATH = /usr/bin/perl +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- +CLASS_DIAGRAMS = YES +MSCGEN_PATH = +HIDE_UNDOC_RELATIONS = YES +HAVE_DOT = NO +DOT_NUM_THREADS = 0 +DOT_FONTNAME = 
FreeSans +DOT_FONTSIZE = 10 +DOT_FONTPATH = +CLASS_GRAPH = YES +COLLABORATION_GRAPH = YES +GROUP_GRAPHS = YES +UML_LOOK = NO +UML_LIMIT_NUM_FIELDS = 10 +TEMPLATE_RELATIONS = YES +INCLUDE_GRAPH = NO +INCLUDED_BY_GRAPH = NO +CALL_GRAPH = NO +CALLER_GRAPH = NO +GRAPHICAL_HIERARCHY = NO +DIRECTORY_GRAPH = YES +DOT_IMAGE_FORMAT = png +INTERACTIVE_SVG = NO +DOT_PATH = +DOTFILE_DIRS = +MSCFILE_DIRS = +DOT_GRAPH_MAX_NODES = 50 +MAX_DOT_GRAPH_DEPTH = 0 +DOT_TRANSPARENT = NO +DOT_MULTI_TARGETS = NO +GENERATE_LEGEND = YES +DOT_CLEANUP = YES diff --git a/docsrc/planck.make b/docsrc/planck.make new file mode 100644 index 0000000..0d0a462 --- /dev/null +++ b/docsrc/planck.make @@ -0,0 +1,20 @@ +PKG:=docsrc + +docsrc_idx: $(DOCDIR)_mkdir + cp $(SRCROOT)/docsrc/index_code.html $(DOCDIR)/index.html + +docsrc_code_doc: $(DOCDIR)_mkdir docsrc_idx + cd $(SRCROOT)/docsrc; \ + for i in c_utils libfftpack libsharp; do \ + doxygen $${i}.dox; \ + rm -rf $(DOCDIR)/$${i}; mv htmldoc $(DOCDIR)/$${i}; \ + done; \ + rm *.tag; + +docsrc_clean: + cd $(SRCROOT)/docsrc; \ + rm -f *.tag + cd $(SRCROOT)/docsrc; \ + rm -rf htmldoc + +doc: docsrc_code_doc diff --git a/libfftpack/README b/libfftpack/README new file mode 100644 index 0000000..2c7e7cb --- /dev/null +++ b/libfftpack/README @@ -0,0 +1,34 @@ +ls_fft description: + +This package is intended to calculate one-dimensional real or complex FFTs +with high accuracy and good efficiency even for lengths containing large +prime factors. +The code is written in C, but a Fortran wrapper exists as well. + +Before any FFT is executed, a plan must be generated for it. Plan creation +is designed to be fast, so that there is no significant overhead if the +plan is only used once or a few times. + +The main component of the code is based on Paul N. Swarztrauber's FFTPACK in the +double precision incarnation by Hugh C. Pumphrey +(http://www.netlib.org/fftpack/dp.tgz). 
+ +I replaced the iterative sine and cosine calculations in radfg() and radbg() +by an exact calculation, which slightly improves the transform accuracy for +real FFTs with lengths containing large prime factors. + +Since FFTPACK becomes quite slow for FFT lengths with large prime factors +(in the worst case of prime lengths it reaches O(n*n) complexity), I +implemented Bluestein's algorithm, which computes a FFT of length n by +several FFTs of length n2>=2*n-1 and a convolution. Since n2 can be chosen +to be highly composite, this algorithm is more efficient if n has large +prime factors. The longer FFTs themselves are then computed using the FFTPACK +routines. +Bluestein's algorithm was implemented according to the description at +http://en.wikipedia.org/wiki/Bluestein's_FFT_algorithm. + +Thread-safety: +All routines can be called concurrently; all information needed by ls_fft +is stored in the plan variable. However, using the same plan variable on +multiple threads simultaneously is not supported and will lead to data +corruption. diff --git a/libfftpack/bluestein.c b/libfftpack/bluestein.c new file mode 100644 index 0000000..2e2005c --- /dev/null +++ b/libfftpack/bluestein.c @@ -0,0 +1,173 @@ +/* + * This file is part of libfftpack. + * + * libfftpack is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libfftpack is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with libfftpack; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/* + * Copyright (C) 2005, 2006, 2007, 2008 Max-Planck-Society + * \author Martin Reinecke + */ + +#include +#include +#include "fftpack.h" +#include "bluestein.h" + +/* returns the sum of all prime factors of n */ +size_t prime_factor_sum (size_t n) + { + size_t result=0,x,limit,tmp; + while (((tmp=(n>>1))<<1)==n) + { result+=2; n=tmp; } + + limit=(size_t)sqrt(n+0.01); + for (x=3; x<=limit; x+=2) + while ((tmp=(n/x))*x==n) + { + result+=x; + n=tmp; + limit=(size_t)sqrt(n+0.01); + } + if (n>1) result+=n; + + return result; + } + +/* returns the smallest composite of 2, 3 and 5 which is >= n */ +static size_t good_size(size_t n) + { + size_t f2, f23, f235, bestfac=2*n; + if (n<=6) return n; + + for (f2=1; f2=n) bestfac=f235; + return bestfac; + } + +void bluestein_i (size_t n, double **tstorage, size_t *worksize) + { + static const double pi=3.14159265358979323846; + size_t n2=good_size(n*2-1); + size_t m, coeff; + double angle, xn2; + double *bk, *bkf, *work; + double pibyn=pi/n; + *worksize=2+2*n+8*n2+16; + *tstorage = RALLOC(double,2+2*n+8*n2+16); + ((size_t *)(*tstorage))[0]=n2; + bk = *tstorage+2; + bkf = *tstorage+2+2*n; + work= *tstorage+2+2*(n+n2); + +/* initialize b_k */ + bk[0] = 1; + bk[1] = 0; + + coeff=0; + for (m=1; m=2*n) coeff-=2*n; + angle = pibyn*coeff; + bk[2*m] = cos(angle); + bk[2*m+1] = sin(angle); + } + +/* initialize the zero-padded, Fourier transformed b_k. Add normalisation. 
*/ + xn2 = 1./n2; + bkf[0] = bk[0]*xn2; + bkf[1] = bk[1]*xn2; + for (m=2; m<2*n; m+=2) + { + bkf[m] = bkf[2*n2-m] = bk[m] *xn2; + bkf[m+1] = bkf[2*n2-m+1] = bk[m+1] *xn2; + } + for (m=2*n;m<=(2*n2-2*n+1);++m) + bkf[m]=0.; + cffti (n2,work); + cfftf (n2,bkf,work); + } + +void bluestein (size_t n, double *data, double *tstorage, int isign) + { + size_t n2=*((size_t *)tstorage); + size_t m; + double *bk, *bkf, *akf, *work; + bk = tstorage+2; + bkf = tstorage+2+2*n; + work= tstorage+2+2*(n+n2); + akf = tstorage+2+2*n+6*n2+16; + +/* initialize a_k and FFT it */ + if (isign>0) + for (m=0; m<2*n; m+=2) + { + akf[m] = data[m]*bk[m] - data[m+1]*bk[m+1]; + akf[m+1] = data[m]*bk[m+1] + data[m+1]*bk[m]; + } + else + for (m=0; m<2*n; m+=2) + { + akf[m] = data[m]*bk[m] + data[m+1]*bk[m+1]; + akf[m+1] =-data[m]*bk[m+1] + data[m+1]*bk[m]; + } + for (m=2*n; m<2*n2; ++m) + akf[m]=0; + + cfftf (n2,akf,work); + +/* do the convolution */ + if (isign>0) + for (m=0; m<2*n2; m+=2) + { + double im = -akf[m]*bkf[m+1] + akf[m+1]*bkf[m]; + akf[m ] = akf[m]*bkf[m] + akf[m+1]*bkf[m+1]; + akf[m+1] = im; + } + else + for (m=0; m<2*n2; m+=2) + { + double im = akf[m]*bkf[m+1] + akf[m+1]*bkf[m]; + akf[m ] = akf[m]*bkf[m] - akf[m+1]*bkf[m+1]; + akf[m+1] = im; + } + + +/* inverse FFT */ + cfftb (n2,akf,work); + +/* multiply by b_k* */ + if (isign>0) + for (m=0; m<2*n; m+=2) + { + data[m] = bk[m] *akf[m] - bk[m+1]*akf[m+1]; + data[m+1] = bk[m+1]*akf[m] + bk[m] *akf[m+1]; + } + else + for (m=0; m<2*n; m+=2) + { + data[m] = bk[m] *akf[m] + bk[m+1]*akf[m+1]; + data[m+1] =-bk[m+1]*akf[m] + bk[m] *akf[m+1]; + } + } diff --git a/libfftpack/bluestein.h b/libfftpack/bluestein.h new file mode 100644 index 0000000..91e5b28 --- /dev/null +++ b/libfftpack/bluestein.h @@ -0,0 +1,48 @@ +/* + * This file is part of libfftpack. 
+ * + * libfftpack is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libfftpack is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libfftpack; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/* + * Copyright (C) 2005 Max-Planck-Society + * \author Martin Reinecke + */ + +#ifndef PLANCK_BLUESTEIN_H +#define PLANCK_BLUESTEIN_H + +#include "c_utils.h" + +#ifdef __cplusplus +extern "C" { +#endif + +size_t prime_factor_sum (size_t n); + +void bluestein_i (size_t n, double **tstorage, size_t *worksize); +void bluestein (size_t n, double *data, double *tstorage, int isign); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libfftpack/fftpack.c b/libfftpack/fftpack.c new file mode 100644 index 0000000..6d09d06 --- /dev/null +++ b/libfftpack/fftpack.c @@ -0,0 +1,833 @@ +/* + * This file is part of libfftpack. + * + * libfftpack is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libfftpack is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libfftpack; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/* + fftpack.c : A set of FFT routines in C. + Algorithmically based on Fortran-77 FFTPACK by Paul N. Swarztrauber + (Version 4, 1985). + + C port by Martin Reinecke (2010) + */ + +#include +#include +#include +#include "fftpack.h" + +#define WA(x,i) wa[(i)+(x)*ido] +#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))] +#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))] +#define PM(a,b,c,d) { a=c+d; b=c-d; } +#define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; } +#define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; } +#define SCALEC(a,b) { a.r*=b; a.i*=b; } +#define CONJFLIPC(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; } +/* (a+ib) = conj(c+id) * (e+if) */ +#define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; } + +typedef struct { + double r,i; +} cmplx; + +#define CONCAT(a,b) a ## b + +#define X(arg) CONCAT(passb,arg) +#define BACKWARD +#include "fftpack_inc.c" +#undef BACKWARD +#undef X + +#define X(arg) CONCAT(passf,arg) +#include "fftpack_inc.c" +#undef X + +#undef CC +#undef CH +#define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))] +#define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))] + +static void radf2 (size_t ido, size_t l1, const double *cc, double *ch, + const double *wa) + { + const size_t cdim=2; + size_t i, k, ic; + double ti2, tr2; + + for (k=0; k=2*ip) aidx-=2*ip; + ar2=csarr[aidx]; + ai2=csarr[aidx+1]; + for(ik=0; ik=2*ip) aidx-=2*ip; + ar2=csarr[aidx]; + ai2=csarr[aidx+1]; + for(ik=0; ik0) ? passb4(ido, l1, p1, p2, wa+iw) + : passf4(ido, l1, p1, p2, wa+iw); + else if(ip==2) + (isign>0) ? 
passb2(ido, l1, p1, p2, wa+iw) + : passf2(ido, l1, p1, p2, wa+iw); + else if(ip==3) + (isign>0) ? passb3(ido, l1, p1, p2, wa+iw) + : passf3(ido, l1, p1, p2, wa+iw); + else if(ip==5) + (isign>0) ? passb5(ido, l1, p1, p2, wa+iw) + : passf5(ido, l1, p1, p2, wa+iw); + else if(ip==6) + (isign>0) ? passb6(ido, l1, p1, p2, wa+iw) + : passf6(ido, l1, p1, p2, wa+iw); + else + (isign>0) ? passbg(ido, ip, l1, p1, p2, wa+iw) + : passfg(ido, ip, l1, p1, p2, wa+iw); + SWAP(p1,p2,cmplx *); + l1=l2; + iw+=(ip-1)*ido; + } + if (p1!=c) + memcpy (c,p1,n*sizeof(cmplx)); + } + +void cfftf(size_t n, double c[], double wsave[]) + { + if (n!=1) + cfft1(n, (cmplx*)c, (cmplx*)wsave, (cmplx*)(wsave+2*n), + (size_t*)(wsave+4*n),-1); + } + +void cfftb(size_t n, double c[], double wsave[]) + { + if (n!=1) + cfft1(n, (cmplx*)c, (cmplx*)wsave, (cmplx*)(wsave+2*n), + (size_t*)(wsave+4*n),+1); + } + +static void factorize (size_t n, const size_t *pf, size_t npf, size_t *ifac) + { + size_t nl=n, nf=0, ntry=0, j=0, i; + +startloop: + j++; + ntry = (j<=npf) ? pf[j-1] : ntry+2; + do + { + size_t nq=nl / ntry; + size_t nr=nl-ntry*nq; + if (nr!=0) + goto startloop; + nf++; + ifac[nf+1]=ntry; + nl=nq; + if ((ntry==2) && (nf!=1)) + { + for (i=nf+1; i>2; --i) + ifac[i]=ifac[i-1]; + ifac[2]=2; + } + } + while(nl!=1); + ifac[0]=n; + ifac[1]=nf; + } + +static void cffti1(size_t n, double wa[], size_t ifac[]) + { + static const size_t ntryh[5]={4,6,3,2,5}; + static const double twopi=6.28318530717958647692; + size_t j, k, fi; + + double argh=twopi/n; + size_t i=0, l1=1; + factorize (n,ntryh,5,ifac); + for(k=1; k<=ifac[1]; k++) + { + size_t ip=ifac[k+1]; + size_t ido=n/(l1*ip); + for(j=1; j6) + { + wa[is ]=wa[i ]; + wa[is+1]=wa[i+1]; + } + } + l1*=ip; + } + } + +void cffti(size_t n, double wsave[]) + { if (n!=1) cffti1(n, wsave+2*n,(size_t*)(wsave+4*n)); } + + +/*---------------------------------------------------------------------- + rfftf1, rfftb1, rfftf, rfftb, rffti1, rffti. Real FFTs. 
+ ----------------------------------------------------------------------*/ + +static void rfftf1(size_t n, double c[], double ch[], const double wa[], + const size_t ifac[]) + { + size_t k1, l1=n, nf=ifac[1], iw=n-1; + double *p1=ch, *p2=c; + + for(k1=1; k1<=nf;++k1) + { + size_t ip=ifac[nf-k1+2]; + size_t ido=n / l1; + l1 /= ip; + iw-=(ip-1)*ido; + SWAP (p1,p2,double *); + if(ip==4) + radf4(ido, l1, p1, p2, wa+iw); + else if(ip==2) + radf2(ido, l1, p1, p2, wa+iw); + else if(ip==3) + radf3(ido, l1, p1, p2, wa+iw); + else if(ip==5) + radf5(ido, l1, p1, p2, wa+iw); + else + { + if (ido==1) + SWAP (p1,p2,double *); + radfg(ido, ip, l1, ido*l1, p1, p2, wa+iw); + SWAP (p1,p2,double *); + } + } + if (p1==c) + memcpy (c,ch,n*sizeof(double)); + } + +static void rfftb1(size_t n, double c[], double ch[], const double wa[], + const size_t ifac[]) + { + size_t k1, l1=1, nf=ifac[1], iw=0; + double *p1=c, *p2=ch; + + for(k1=1; k1<=nf; k1++) + { + size_t ip = ifac[k1+1], + ido= n/(ip*l1); + if(ip==4) + radb4(ido, l1, p1, p2, wa+iw); + else if(ip==2) + radb2(ido, l1, p1, p2, wa+iw); + else if(ip==3) + radb3(ido, l1, p1, p2, wa+iw); + else if(ip==5) + radb5(ido, l1, p1, p2, wa+iw); + else + { + radbg(ido, ip, l1, ido*l1, p1, p2, wa+iw); + if (ido!=1) + SWAP (p1,p2,double *); + } + SWAP (p1,p2,double *); + l1*=ip; + iw+=(ip-1)*ido; + } + if (p1!=c) + memcpy (c,ch,n*sizeof(double)); + } + +void rfftf(size_t n, double r[], double wsave[]) + { if(n!=1) rfftf1(n, r, wsave, wsave+n,(size_t*)(wsave+2*n)); } + +void rfftb(size_t n, double r[], double wsave[]) + { if(n!=1) rfftb1(n, r, wsave, wsave+n,(size_t*)(wsave+2*n)); } + +static void rffti1(size_t n, double wa[], size_t ifac[]) + { + static const size_t ntryh[4]={4,2,3,5}; + static const double twopi=6.28318530717958647692; + size_t i, j, k, fi; + + double argh=twopi/n; + size_t is=0, l1=1; + factorize (n,ntryh,4,ifac); + for (k=1; kip) iang-=ip; + abr.r += ccl[l ].r*wal[iang].r; + abr.i += ccl[l ].i*wal[iang].r; + abi.r += 
ccl[lc].r*wal[iang].i; + abi.i += ccl[lc].i*wal[iang].i; + } +#ifndef BACKWARD + { abi.i=-abi.i; abi.r=-abi.r; } +#endif + CONJFLIPC(abi) + PMC(CH(i,k,j),CH(i,k,jc),abr,abi) + } + } + + DEALLOC(tarr); + + if (ido==1) return; + + for (j=1; j +
  • \ref fftgroup "Programming interface" + + */ diff --git a/libfftpack/ls_fft.c b/libfftpack/ls_fft.c new file mode 100644 index 0000000..b1c0c96 --- /dev/null +++ b/libfftpack/ls_fft.c @@ -0,0 +1,291 @@ +/* + * This file is part of libfftpack. + * + * libfftpack is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libfftpack is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libfftpack; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). 
+ */ + +/* + * Copyright (C) 2005 Max-Planck-Society + * \author Martin Reinecke + */ + +#include +#include +#include +#include "bluestein.h" +#include "fftpack.h" +#include "ls_fft.h" + +complex_plan make_complex_plan (size_t length) + { + complex_plan plan = RALLOC(complex_plan_i,1); + size_t pfsum = prime_factor_sum(length); + double comp1 = (double)(length*pfsum); + double comp2 = 2*3*length*log(3.*length); + comp2*=3.; /* fudge factor that appears to give good overall performance */ + plan->length=length; + plan->bluestein = (comp2bluestein) + bluestein_i (length,&(plan->work),&(plan->worksize)); + else + { + plan->worksize=4*length+15; + plan->work=RALLOC(double,4*length+15); + cffti(length, plan->work); + } + return plan; + } + +complex_plan copy_complex_plan (complex_plan plan) + { + if (!plan) return NULL; + { + complex_plan newplan = RALLOC(complex_plan_i,1); + *newplan = *plan; + newplan->work=RALLOC(double,newplan->worksize); + memcpy(newplan->work,plan->work,sizeof(double)*newplan->worksize); + return newplan; + } + } + +void kill_complex_plan (complex_plan plan) + { + DEALLOC(plan->work); + DEALLOC(plan); + } + +void complex_plan_forward (complex_plan plan, double *data) + { + if (plan->bluestein) + bluestein (plan->length, data, plan->work, -1); + else + cfftf (plan->length, data, plan->work); + } + +void complex_plan_backward (complex_plan plan, double *data) + { + if (plan->bluestein) + bluestein (plan->length, data, plan->work, 1); + else + cfftb (plan->length, data, plan->work); + } + + +real_plan make_real_plan (size_t length) + { + real_plan plan = RALLOC(real_plan_i,1); + size_t pfsum = prime_factor_sum(length); + double comp1 = .5*length*pfsum; + double comp2 = 2*3*length*log(3.*length); + comp2*=3; /* fudge factor that appears to give good overall performance */ + plan->length=length; + plan->bluestein = (comp2bluestein) + bluestein_i (length,&(plan->work),&(plan->worksize)); + else + { + plan->worksize=2*length+15; + 
plan->work=RALLOC(double,2*length+15); + rffti(length, plan->work); + } + return plan; + } + +real_plan copy_real_plan (real_plan plan) + { + if (!plan) return NULL; + { + real_plan newplan = RALLOC(real_plan_i,1); + *newplan = *plan; + newplan->work=RALLOC(double,newplan->worksize); + memcpy(newplan->work,plan->work,sizeof(double)*newplan->worksize); + return newplan; + } + } + +void kill_real_plan (real_plan plan) + { + DEALLOC(plan->work); + DEALLOC(plan); + } + +void real_plan_forward_fftpack (real_plan plan, double *data) + { + if (plan->bluestein) + { + size_t m; + size_t n=plan->length; + double *tmp = RALLOC(double,2*n); + for (m=0; mwork,-1); + data[0] = tmp[0]; + memcpy (data+1, tmp+2, (n-1)*sizeof(double)); + DEALLOC(tmp); + } + else + rfftf (plan->length, data, plan->work); + } + +static void fftpack2halfcomplex (double *data, size_t n) + { + size_t m; + double *tmp = RALLOC(double,n); + tmp[0]=data[0]; + for (m=1; m<(n+1)/2; ++m) + { + tmp[m]=data[2*m-1]; + tmp[n-m]=data[2*m]; + } + if (!(n&1)) + tmp[n/2]=data[n-1]; + memcpy (data,tmp,n*sizeof(double)); + DEALLOC(tmp); + } + +static void halfcomplex2fftpack (double *data, size_t n) + { + size_t m; + double *tmp = RALLOC(double,n); + tmp[0]=data[0]; + for (m=1; m<(n+1)/2; ++m) + { + tmp[2*m-1]=data[m]; + tmp[2*m]=data[n-m]; + } + if (!(n&1)) + tmp[n-1]=data[n/2]; + memcpy (data,tmp,n*sizeof(double)); + DEALLOC(tmp); + } + +void real_plan_forward_fftw (real_plan plan, double *data) + { + real_plan_forward_fftpack (plan, data); + fftpack2halfcomplex (data,plan->length); + } + +void real_plan_backward_fftpack (real_plan plan, double *data) + { + if (plan->bluestein) + { + size_t m; + size_t n=plan->length; + double *tmp = RALLOC(double,2*n); + tmp[0]=data[0]; + tmp[1]=0.; + memcpy (tmp+2,data+1, (n-1)*sizeof(double)); + if ((n&1)==0) tmp[n+1]=0.; + for (m=2; mwork, 1); + for (m=0; mlength, data, plan->work); + } + +void real_plan_backward_fftw (real_plan plan, double *data) + { + halfcomplex2fftpack 
(data,plan->length); + real_plan_backward_fftpack (plan, data); + } + +void real_plan_forward_c (real_plan plan, double *data) + { + size_t m; + size_t n=plan->length; + + if (plan->bluestein) + { + for (m=1; m<2*n; m+=2) + data[m]=0; + bluestein (plan->length, data, plan->work, -1); + data[1]=0; + for (m=2; mwork); + data[0] = data[1]; + data[1] = 0; + for (m=2; mlength; + + if (plan->bluestein) + { + size_t m; + data[1]=0; + for (m=2; mlength, data, plan->work, 1); + for (m=1; m<2*n; m+=2) + data[m]=0; + } + else + { + ptrdiff_t m; + data[1] = data[0]; + rfftb (n, data+1, plan->work); + for (m=n-1; m>=0; --m) + { + data[2*m] = data[m+1]; + data[2*m+1] = 0.; + } + } + } diff --git a/libfftpack/ls_fft.h b/libfftpack/ls_fft.h new file mode 100644 index 0000000..8675454 --- /dev/null +++ b/libfftpack/ls_fft.h @@ -0,0 +1,162 @@ +/* + * This file is part of libfftpack. + * + * libfftpack is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libfftpack is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libfftpack; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libfftpack is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/*! \file ls_fft.h + * Interface for the LevelS FFT package. 
+ * + * Copyright (C) 2004 Max-Planck-Society + * \author Martin Reinecke + */ + +#ifndef PLANCK_LS_FFT_H +#define PLANCK_LS_FFT_H + +#include "c_utils.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\defgroup fftgroup FFT interface +This package is intended to calculate one-dimensional real or complex FFTs +with high accuracy and good efficiency even for lengths containing large +prime factors. +The code is written in C, but a Fortran wrapper exists as well. + +Before any FFT is executed, a plan must be generated for it. Plan creation +is designed to be fast, so that there is no significant overhead if the +plan is only used once or a few times. + +The main component of the code is based on Paul N. Swarztrauber's FFTPACK in the +double precision incarnation by Hugh C. Pumphrey +(http://www.netlib.org/fftpack/dp.tgz). + +I replaced the iterative sine and cosine calculations in radfg() and radbg() +by an exact calculation, which slightly improves the transform accuracy for +real FFTs with lengths containing large prime factors. + +Since FFTPACK becomes quite slow for FFT lengths with large prime factors +(in the worst case of prime lengths it reaches \f$\mathcal{O}(n^2)\f$ +complexity), I implemented Bluestein's algorithm, which computes a FFT of length +\f$n\f$ by several FFTs of length \f$n_2\ge 2n-1\f$ and a convolution. Since +\f$n_2\f$ can be chosen to be highly composite, this algorithm is more efficient +if \f$n\f$ has large prime factors. The longer FFTs themselves are then computed +using the FFTPACK routines. +Bluestein's algorithm was implemented according to the description on Wikipedia +( +http://en.wikipedia.org/wiki/Bluestein%27s_FFT_algorithm). + +\b Thread-safety: +All routines can be called concurrently; all information needed by +ls_fft is stored in the plan variable. However, using the same plan +variable on multiple threads simultaneously is not supported and will lead to +data corruption. +*/ +/*! 
\{ */ + +typedef struct + { + double *work; + size_t length, worksize; + int bluestein; + } complex_plan_i; + +/*! The opaque handle type for complex-FFT plans. */ +typedef complex_plan_i * complex_plan; + +/*! Returns a plan for a complex FFT with \a length elements. */ +complex_plan make_complex_plan (size_t length); +/*! Constructs a copy of \a plan. */ +complex_plan copy_complex_plan (complex_plan plan); +/*! Destroys a plan for a complex FFT. */ +void kill_complex_plan (complex_plan plan); +/*! Computes a complex forward FFT on \a data, using \a plan. + \a Data has the form r0, i0, r1, i1, ..., + r[length-1], i[length-1]. */ +void complex_plan_forward (complex_plan plan, double *data); +/*! Computes a complex backward FFT on \a data, using \a plan. + \a Data has the form r0, i0, r1, i1, ..., + r[length-1], i[length-1]. */ +void complex_plan_backward (complex_plan plan, double *data); + +typedef struct + { + double *work; + size_t length, worksize; + int bluestein; + } real_plan_i; + +/*! The opaque handle type for real-FFT plans. */ +typedef real_plan_i * real_plan; + +/*! Returns a plan for a real FFT with \a length elements. */ +real_plan make_real_plan (size_t length); +/*! Constructs a copy of \a plan. */ +real_plan copy_real_plan (real_plan plan); +/*! Destroys a plan for a real FFT. */ +void kill_real_plan (real_plan plan); +/*! Computes a real forward FFT on \a data, using \a plan + and assuming the FFTPACK storage scheme: + - on entry, \a data has the form r0, r1, ..., r[length-1]; + - on exit, it has the form r0, r1, i1, r2, i2, ... + (a total of \a length values). */ +void real_plan_forward_fftpack (real_plan plan, double *data); +/*! Computes a real backward FFT on \a data, using \a plan + and assuming the FFTPACK storage scheme: + - on entry, \a data has the form r0, r1, i1, r2, i2, ... + (a total of \a length values); + - on exit, it has the form r0, r1, ..., r[length-1]. */ +void real_plan_backward_fftpack (real_plan plan, double *data); +/*! 
Computes a real forward FFT on \a data, using \a plan + and assuming the FFTW halfcomplex storage scheme: + - on entry, \a data has the form r0, r1, ..., r[length-1]; + - on exit, it has the form r0, r1, r2, ..., i2, i1. */ +void real_plan_forward_fftw (real_plan plan, double *data); +/*! Computes a real backward FFT on \a data, using \a plan + and assuming the FFTW halfcomplex storage scheme: + - on entry, \a data has the form r0, r1, r2, ..., i2, i1. + - on exit, it has the form r0, r1, ..., r[length-1]. */ +void real_plan_backward_fftw (real_plan plan, double *data); +/*! Computes a real forward FFT on \a data, using \a plan + and assuming a full-complex storage scheme: + - on entry, \a data has the form r0, [ignored], r1, [ignored], ..., + r[length-1], [ignored]; + - on exit, it has the form r0, i0, r1, i1, ..., + r[length-1], i[length-1]. + */ +void real_plan_forward_c (real_plan plan, double *data); +/*! Computes a real backward FFT on \a data, using \a plan + and assuming a full-complex storage scheme: + - on entry, \a data has the form r0, i0, r1, i1, ..., + r[length-1], i[length-1]; + - on exit, it has the form r0, 0, r1, 0, ..., r[length-1], 0. */ +void real_plan_backward_c (real_plan plan, double *data); + +/*! 
\} */ + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libfftpack/planck.make b/libfftpack/planck.make new file mode 100644 index 0000000..c171367 --- /dev/null +++ b/libfftpack/planck.make @@ -0,0 +1,21 @@ +PKG:=libfftpack + +SD:=$(SRCROOT)/$(PKG) +OD:=$(BLDROOT)/$(PKG) + +FULL_INCLUDE+= -I$(SD) + +HDR_$(PKG):=$(SD)/*.h +LIB_$(PKG):=$(LIBDIR)/libfftpack.a +OBJ:=fftpack.o bluestein.o ls_fft.o +OBJ:=$(OBJ:%=$(OD)/%) + +ODEP:=$(HDR_$(PKG)) $(HDR_c_utils) + +$(OD)/fftpack.o: $(SD)/fftpack_inc.c + +$(OBJ): $(ODEP) | $(OD)_mkdir +$(LIB_$(PKG)): $(OBJ) + +all_hdr+=$(HDR_$(PKG)) +all_lib+=$(LIB_$(PKG)) diff --git a/libsharp/complex_hacks.h b/libsharp/complex_hacks.h new file mode 100644 index 0000000..99a7c2b --- /dev/null +++ b/libsharp/complex_hacks.h @@ -0,0 +1,131 @@ +/* + * This file is part of libsharp. + * + * libsharp is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libsharp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libsharp; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). 
+ */ + +/* \file complex_hacks.h + * support for converting vector types and complex numbers + * + * Copyright (C) 2012 Max-Planck-Society + * Author: Martin Reinecke + */ + +#ifndef COMPLEX_HACKS_H +#define COMPLEX_HACKS_H + +#include +#include +#include "vecsupport.h" + +#define UNSAFE_CODE + +#if (VLEN==1) + +static inline complex double vhsum_cmplx(Tv a, Tv b) + { return a+_Complex_I*b; } + +static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d, + complex double * restrict c1, complex double * restrict c2) + { *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; } + +#endif + +#if (VLEN==2) + +static inline complex double vhsum_cmplx (Tv a, Tv b) + { +#if defined(__SSE3__) + Tv tmp = _mm_hadd_pd(a,b); +#else + Tv tmp = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)), + _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0))); +#endif + union {Tv v; complex double c; } u; + u.v=tmp; return u.c; + } + +static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, + Tv d, complex double * restrict c1, complex double * restrict c2) + { +#ifdef UNSAFE_CODE +#if defined(__SSE3__) + vaddeq(*((__m128d *)c1),_mm_hadd_pd(a,b)); + vaddeq(*((__m128d *)c2),_mm_hadd_pd(c,d)); +#else + vaddeq(*((__m128d *)c1),vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)), + _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)))); + vaddeq(*((__m128d *)c2),vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)), + _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)))); +#endif +#else + union {Tv v; complex double c; } u1, u2; +#if defined(__SSE3__) + u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d); +#else + u1.v = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)), + _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0))); + u2.v = vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)), + _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0))); +#endif + *c1+=u1.c; *c2+=u2.c; +#endif + } + +#endif + +#if (VLEN==4) + +static inline complex double vhsum_cmplx (Tv a, Tv b) + { + Tv tmp=_mm256_hadd_pd(a,b); + Tv tmp2=_mm256_permute2f128_pd(tmp,tmp,1); + tmp=_mm256_add_pd(tmp,tmp2); +#ifdef UNSAFE_CODE + complex double ret; + *((__m128d 
*)&ret)=_mm256_extractf128_pd(tmp, 0); + return ret; +#else + union {Tv v; complex double c[2]; } u; + u.v=tmp; return u.c[0]; +#endif + } + +static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d, + complex double * restrict c1, complex double * restrict c2) + { + Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d); + Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49), + tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32); + tmp1=vadd(tmp3,tmp4); +#ifdef UNSAFE_CODE + *((__m128d *)c1)=_mm_add_pd(*((__m128d *)c1),_mm256_extractf128_pd(tmp1, 0)); + *((__m128d *)c2)=_mm_add_pd(*((__m128d *)c2),_mm256_extractf128_pd(tmp1, 1)); +#else + union {Tv v; complex double c[2]; } u; + u.v=tmp1; + *c1+=u.c[0]; *c2+=u.c[1]; +#endif + } + +#endif + +#endif diff --git a/libsharp/libsharp.dox b/libsharp/libsharp.dox new file mode 100644 index 0000000..2a5067c --- /dev/null +++ b/libsharp/libsharp.dox @@ -0,0 +1,94 @@ +/*! \mainpage libsharp documentation + + */ + +/*! \page introduction Introduction to libsharp + + "SHARP" is an acronym for Performant Spherical Harmonic Transforms. + All user-visible data types and functions in this library start with + the prefix "sharp_", or with "sharps_" and "sharpd_" for single- and + double precision variants, respectively. + + libsharp's main functionality is the conversion between maps + on the sphere and spherical harmonic coefficients (or a_lm). + A map is defined as a set of rings, which in turn consist of + individual pixels that +
      +
    • all have the same colatitude and
    • +
    • are uniformly spaced in azimuthal direction.
    • +
    + Consequently, a ring is completely defined by +
      +
    • its colatitude (in radians)
    • +
    • the number of pixels it contains
    • +
    • the azimuth (in radians) of the first pixel in the ring
    • +
    • the weight that must be multiplied to every pixel during a map + analysis (typically the solid angle of a pixel in the ring)
    • +
    • the offset of the first ring pixel in the map array
    • +
    • the stride between consecutive pixels in the ring.
    • +
    + The map array is a one-dimensional array of type float or + double, which contains the values of all map pixels. It is assumed + that the pixels of every ring are stored inside this array in order of + increasing azimuth and with the specified stride. Note however that the rings + themselves can be stored in any order inside the array. + + The a_lm array is a one-dimensional array of type complex float or + complex double, which contains all spherical harmonic coefficients + for a full or partial set of m quantum numbers with 0<=m<=mmax and m<=l<=lmax. + There is only one constraint on the internal structure of the array, which is: + + Index[a_l+1,m] = Index[a_l,m] + stride + + That means that coefficients with identical m but different l + can be interpreted as a one-dimensional array in l with a unique + stride. + + Several functions are provided for efficient index computation in this array; + they are documented \ref almgroup "here". + + Information about a pixelisation of the sphere is stored in objects of + type sharp_geom_info. It is possible to create such an object for any + supported pixelisation by using the function sharp_make_geometry_info(); + however, several easier-to-use functions are \ref geominfogroup "supplied" + for generating often-used pixelisations like ECP grids, Gaussian grids, + and Healpix grids. + + Currently, SHARP supports the following kinds of transforms: +
      +
    • scalar a_lm to map
    • +
    • scalar map to a_lm
    • + +
    • spin a_lm to map
    • +
    • spin map to a_lm
    • + +
    + + SHARP supports shared-memory parallelisation via OpenMP; this feature will + be automatically enabled if the compiler supports it. + + SHARP will also make use of SSE2 and AVX instructions when compiled for a + platform known to support them. + + Support for MPI-parallel transforms is also available; in this mode, + every MPI task must provide a unique subset of the map and a_lm coefficients. + + The spherical harmonic transforms can be executed on double-precision and + single-precision maps and a_lm, but for accuracy reasons the computations + will always be performed in double precision. As a consequence, + single-precision transforms will most likely not be faster than their + double-precision counterparts, but they will require significantly less + memory. + + Two example and benchmark programs are distributed with SHARP: +
      +
    • sharp_test.c checks the accuracy of the (iterative) map analysis + algorithm
    • +
    • sharp_bench.c determines the quickest transform strategy for a given + SHT
    • +
    +*/ diff --git a/libsharp/oracle.inc b/libsharp/oracle.inc new file mode 100644 index 0000000..7680861 --- /dev/null +++ b/libsharp/oracle.inc @@ -0,0 +1,9 @@ +static const int maxtr = 6; +static const int nv_opt[6][2][3] = { +{{4,2,-1},{2,1,-1}}, +{{4,2,-1},{2,1,-1}}, +{{5,2,-1},{5,2,-1}}, +{{5,2,-1},{5,2,-1}}, +{{5,2,-1},{5,2,-1}}, +{{5,2,-1},{5,2,-1}} +}; diff --git a/libsharp/planck.make b/libsharp/planck.make new file mode 100644 index 0000000..23dd2ad --- /dev/null +++ b/libsharp/planck.make @@ -0,0 +1,29 @@ +PKG:=libsharp + +SD:=$(SRCROOT)/$(PKG) +OD:=$(BLDROOT)/$(PKG) + +FULL_INCLUDE+= -I$(SD) + +HDR_$(PKG):=$(SD)/*.h +LIB_$(PKG):=$(LIBDIR)/libsharp.a +BIN:=sharp_test sharp_acctest sharp_test_mpi sharp_bench +LIBOBJ:=ylmgen_c.o sharp.o sharp_geomhelpers.o sharp_almhelpers.o sharp_core.o +ALLOBJ:=$(LIBOBJ) sharp_test.o sharp_acctest.o sharp_test_mpi.o sharp_bench.o +LIBOBJ:=$(LIBOBJ:%=$(OD)/%) +ALLOBJ:=$(ALLOBJ:%=$(OD)/%) + +ODEP:=$(HDR_$(PKG)) $(HDR_libfftpack) $(HDR_c_utils) +$(OD)/sharp_core.o: $(SD)/sharp_inchelper1.inc.c $(SD)/sharp_core_inc.c $(SD)/sharp_core_inc2.c $(SD)/sharp_core_inc3.c +$(OD)/sharp.o: $(SD)/sharp_mpi.c $(SD)/oracle.inc +BDEP:=$(LIB_$(PKG)) $(LIB_libfftpack) $(LIB_c_utils) + +$(LIB_$(PKG)): $(LIBOBJ) + +$(ALLOBJ): $(ODEP) | $(OD)_mkdir +BIN:=$(BIN:%=$(BINDIR)/%) +$(BIN): $(BINDIR)/% : $(OD)/%.o $(BDEP) + +all_hdr+=$(HDR_$(PKG)) +all_lib+=$(LIB_$(PKG)) +all_cbin+=$(BIN) diff --git a/libsharp/sharp.c b/libsharp/sharp.c new file mode 100644 index 0000000..6f44ffa --- /dev/null +++ b/libsharp/sharp.c @@ -0,0 +1,596 @@ +/* + * This file is part of libsharp. + * + * libsharp is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * libsharp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libsharp; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/*! \file sharp.c + * Spherical transform library + * + * Copyright (C) 2006-2012 Max-Planck-Society + * \author Martin Reinecke + */ + +#include +#include "ls_fft.h" +#include "ylmgen_c.h" +#include "sharp.h" +#include "c_utils.h" +#include "sharp_core.h" +#include "vec_utils.h" +#include "walltime_c.h" + +typedef complex double dcmplx; +typedef complex float fcmplx; + +static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize) + { + static const int chunksize_min=500, nchunks_max=10; + *chunksize = IMAX(chunksize_min,(ndata+nchunks_max-1)/nchunks_max); + *chunksize = ((*chunksize+nmult-1)/nmult)*nmult; + *nchunks = (ndata+*chunksize-1) / *chunksize; + } + +typedef struct + { + double s; + int i; + } idxhelper; + +static int idx_compare (const void *xa, const void *xb) + { + const idxhelper *a=xa, *b=xb; + return (a->s > b->s) ? -1 : (a->s < b->s) ? 
1 : 0; + } + +typedef struct + { + double phi0_; + dcmplx *shiftarr, *work; + int s_shift, s_work; + real_plan plan; + int norot; + } ringhelper; + +static void ringhelper_init (ringhelper *self) + { + static ringhelper rh_null = { 0, NULL, NULL, 0, 0, NULL, 0 }; + *self = rh_null; + } + +static void ringhelper_destroy (ringhelper *self) + { + if (self->plan) kill_real_plan(self->plan); + DEALLOC(self->shiftarr); + DEALLOC(self->work); + ringhelper_init(self); + } + +static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0) + { + self->norot = (fabs(phi0)<1e-14); + if (!(self->norot)) + if ((mmax!=self->s_shift-1) || (!FAPPROX(phi0,self->phi0_,1e-12))) + { + RESIZE (self->shiftarr,dcmplx,mmax+1); + self->s_shift = mmax+1; + self->phi0_ = phi0; + for (int m=0; m<=mmax; ++m) + self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0); + } + if (!self->plan) self->plan=make_real_plan(nph); + if (nph!=(int)self->plan->length) + { + kill_real_plan(self->plan); + self->plan=make_real_plan(nph); + } + GROW(self->work,dcmplx,self->s_work,nph); + } + +static int ringinfo_compare (const void *xa, const void *xb) + { + const sharp_ringinfo *a=xa, *b=xb; + return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0; + } +static int ringpair_compare (const void *xa, const void *xb) + { + const sharp_ringpair *a=xa, *b=xb; + if (a->r1.nph==b->r1.nph) + return (a->r1.phi0 < b->r1.phi0) ? -1 : (a->r1.phi0 > b->r1.phi0) ? 1 : 0; + return (a->r1.nphr1.nph) ? 
-1 : 1; + } + +void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval, + const ptrdiff_t *mstart, sharp_alm_info **alm_info) + { + sharp_alm_info *info = RALLOC(sharp_alm_info,1); + info->lmax = lmax; + info->nm = nm; + info->mval = RALLOC(int,nm); + info->mvstart = RALLOC(ptrdiff_t,nm); + info->stride = stride; + for (int mi=0; mimval[mi] = mval[mi]; + info->mvstart[mi] = mstart[mi]; + } + *alm_info = info; + } + +void sharp_make_alm_info (int lmax, int mmax, int stride, + const ptrdiff_t *mstart, sharp_alm_info **alm_info) + { + int *mval=RALLOC(int,mmax+1); + for (int i=0; i<=mmax; ++i) + mval[i]=i; + sharp_make_general_alm_info (lmax, mmax+1, stride, mval, mstart, alm_info); + DEALLOC(mval); + } + +ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi) + { return self->mvstart[mi]+self->stride*l; } + +void sharp_destroy_alm_info (sharp_alm_info *info) + { + DEALLOC (info->mval); + DEALLOC (info->mvstart); + DEALLOC (info); + } + +void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs, + const int *stride, const double *phi0, const double *theta, + const double *weight, sharp_geom_info **geom_info) + { + sharp_geom_info *info = RALLOC(sharp_geom_info,1); + sharp_ringinfo *infos = RALLOC(sharp_ringinfo,nrings); + + int pos=0; + info->pair=RALLOC(sharp_ringpair,nrings); + info->npairs=0; + *geom_info = info; + + for (int m=0; mpair[info->npairs].r1=infos[pos]; + if ((pospair[info->npairs].r2=infos[pos+1]; + ++pos; + } + else + info->pair[info->npairs].r2.nph=-1; + ++pos; + ++info->npairs; + } + DEALLOC(infos); + + qsort(info->pair,info->npairs,sizeof(sharp_ringpair),ringpair_compare); + } + +void sharp_destroy_geom_info (sharp_geom_info *geom_info) + { + DEALLOC (geom_info->pair); + DEALLOC (geom_info); + } + +static int sharp_get_mmax (int *mval, int nm) + { + int *mcheck=RALLOC(int,nm); + SET_ARRAY(mcheck,0,nm,0); + for (int i=0; i=0) && (m_curnph; + int stride = info->stride; + + ringhelper_update 
(self, nph, mmax, info->phi0); + self->work[0]=phase[0]; + SET_ARRAY(self->work,1,nph,0.); + +#if 0 + if (self->norot) + for (int m=1; m<=mmax; ++m) + { + int idx1 = m%nph; + int idx2 = nph-1-((m-1)%nph); + self->work[idx1]+=phase[m*pstride]; + self->work[idx2]+=conj(phase[m*pstride]); + } + else + for (int m=1; m<=mmax; ++m) + { + int idx1 = m%nph; + int idx2 = nph-1-((m-1)%nph); + dcmplx tmp = phase[m*pstride]*self->shiftarr[m]; + self->work[idx1]+=tmp; + self->work[idx2]+=conj(tmp); + } +#else + int idx1=1, idx2=nph-1; + for (int m=1; m<=mmax; ++m) + { + dcmplx tmp = phase[m*pstride]; + if(!self->norot) tmp*=self->shiftarr[m]; + self->work[idx1]+=tmp; + self->work[idx2]+=conj(tmp); + if (++idx1>=nph) idx1=0; + if (--idx2<0) idx2=nph-1; + } +#endif + real_plan_backward_c (self->plan, (double *)(self->work)); + if (fde==DOUBLE) + for (int m=0; mofs] += creal(self->work[m]); + else + for (int m=0; mofs] += (float)creal(self->work[m]); + } + +static void ringhelper_ring2phase (ringhelper *self, + const sharp_ringinfo *info, const void *data, int mmax, dcmplx *phase, + int pstride, sharp_fde fde) + { + int nph = info->nph; +#if 1 + int maxidx = mmax; /* Enable this for traditional Healpix compatibility */ +#else + int maxidx = IMIN(nph-1,mmax); +#endif + + ringhelper_update (self, nph, mmax, -info->phi0); + if (fde==DOUBLE) + for (int m=0; mwork[m] = ((double *)data)[info->ofs+m*info->stride]*info->weight; + else + for (int m=0; mwork[m] = ((float *)data)[info->ofs+m*info->stride]*info->weight; + + real_plan_forward_c (self->plan, (double *)self->work); + + if (self->norot) + for (int m=0; m<=maxidx; ++m) + phase[m*pstride] = self->work[m%nph]; + else + for (int m=0; m<=maxidx; ++m) + phase[m*pstride]=self->work[m%nph]*self->shiftarr[m]; + + for (int m=maxidx+1;m<=mmax; ++m) + phase[m*pstride]=0.; + } + +static void ringhelper_pair2phase (ringhelper *self, int mmax, + const sharp_ringpair *pair, const void *data, dcmplx *phase1, dcmplx *phase2, + int pstride, 
sharp_fde fde) + { + ringhelper_ring2phase (self, &(pair->r1), data, mmax, phase1, pstride, fde); + if (pair->r2.nph>0) + ringhelper_ring2phase (self, &(pair->r2), data, mmax, phase2, pstride, fde); + } + +static void ringhelper_phase2pair (ringhelper *self, int mmax, + const dcmplx *phase1, const dcmplx *phase2, int pstride, + const sharp_ringpair *pair, void *data, sharp_fde fde) + { + ringhelper_phase2ring (self, &(pair->r1), data, mmax, phase1, pstride, fde); + if (pair->r2.nph>0) + ringhelper_phase2ring (self, &(pair->r2), data, mmax, phase2, pstride, fde); + } + +static void fill_map (const sharp_geom_info *ginfo, void *map, double value, + sharp_fde fde) + { + for (int j=0;jnpairs;++j) + { + if (fde==DOUBLE) + { + for (int i=0;ipair[j].r1.nph;++i) + ((double *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=value; + for (int i=0;ipair[j].r2.nph;++i) + ((double *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=value; + } + else + { + for (int i=0;ipair[j].r1.nph;++i) + ((float *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride] + =(float)value; + for (int i=0;ipair[j].r2.nph;++i) + ((float *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride] + =(float)value; + } + } + } + +static void fill_alm (const sharp_alm_info *ainfo, void *alm, dcmplx value, + sharp_fde fde) + { + if (fde==DOUBLE) + for (int mi=0;minm;++mi) + for (int l=ainfo->mval[mi];l<=ainfo->lmax;++l) + ((dcmplx *)alm)[sharp_alm_index(ainfo,l,mi)] = value; + else + for (int mi=0;minm;++mi) + for (int l=ainfo->mval[mi];l<=ainfo->lmax;++l) + ((fcmplx *)alm)[sharp_alm_index(ainfo,l,mi)] = (fcmplx)value; + } + +static void init_output (sharp_job *job) + { + if (job->add_output) return; + if (job->type == MAP2ALM) + for (int i=0; intrans*job->nalm; ++i) + fill_alm (job->ainfo,job->alm[i],0.,job->fde); + else + for (int i=0; intrans*job->nmaps; ++i) + fill_map (job->ginfo,job->map[i],0.,job->fde); + } + +static void alloc_phase (sharp_job *job, int nm, int ntheta) + { 
job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*nm*ntheta); } + +static void dealloc_phase (sharp_job *job) + { DEALLOC(job->phase); } + +//FIXME: set phase to zero if not MAP2ALM? +static void map2phase (sharp_job *job, int mmax, int llim, int ulim) + { + if (job->type != MAP2ALM) return; + int pstride = 2*job->ntrans*job->nmaps; +#pragma omp parallel +{ + ringhelper helper; + ringhelper_init(&helper); +#pragma omp for schedule(dynamic,1) + for (int ith=llim; ithntrans*job->nmaps; ++i) + ringhelper_pair2phase(&helper,mmax,&job->ginfo->pair[ith], job->map[i], + &job->phase[dim2+2*i], &job->phase[dim2+2*i+1], pstride, job->fde); + } + ringhelper_destroy(&helper); +} /* end of parallel region */ + } + +static void alloc_almtmp (sharp_job *job, int lmax) + { job->almtmp=RALLOC(dcmplx,job->ntrans*job->nalm*(lmax+1)); } + +static void dealloc_almtmp (sharp_job *job) + { DEALLOC(job->almtmp); } + +static void alm2almtmp (sharp_job *job, int lmax, int mi) + { + if (job->type!=MAP2ALM) + for (int l=job->ainfo->mval[mi]; l<=lmax; ++l) + { + ptrdiff_t aidx = sharp_alm_index(job->ainfo,l,mi); + double fct = (job->type==ALM2MAP) ? 
job->norm_l[l] : + -fabs(job->norm_l[l])*sqrt(l*(l+1.)); + for (int i=0; intrans*job->nalm; ++i) + if (job->fde==DOUBLE) + job->almtmp[job->ntrans*job->nalm*l+i] + = ((dcmplx *)job->alm[i])[aidx]*fct; + else + job->almtmp[job->ntrans*job->nalm*l+i] + = ((fcmplx *)job->alm[i])[aidx]*fct; + } + else + SET_ARRAY(job->almtmp,job->ntrans*job->nalm*job->ainfo->mval[mi], + job->ntrans*job->nalm*(lmax+1),0.); + } + +static void almtmp2alm (sharp_job *job, int lmax, int mi) + { + if (job->type != MAP2ALM) return; + for (int l=job->ainfo->mval[mi]; l<=lmax; ++l) + { + ptrdiff_t aidx = sharp_alm_index(job->ainfo,l,mi); + for (int i=0;intrans*job->nalm;++i) + if (job->fde==DOUBLE) + ((dcmplx *)job->alm[i])[aidx] += + job->almtmp[job->ntrans*job->nalm*l+i]*job->norm_l[l]; + else + ((fcmplx *)job->alm[i])[aidx] += + (fcmplx)(job->almtmp[job->ntrans*job->nalm*l+i]*job->norm_l[l]); + } + } + +static void phase2map (sharp_job *job, int mmax, int llim, int ulim) + { + if (job->type == MAP2ALM) return; + int pstride = 2*job->ntrans*job->nmaps; +#pragma omp parallel +{ + ringhelper helper; + ringhelper_init(&helper); +#pragma omp for schedule(dynamic,1) + for (int ith=llim; ithntrans*job->nmaps; ++i) + ringhelper_phase2pair(&helper,mmax,&job->phase[dim2+2*i], + &job->phase[dim2+2*i+1],pstride,&job->ginfo->pair[ith],job->map[i], + job->fde); + } + ringhelper_destroy(&helper); +} /* end of parallel region */ + } + +void sharp_execute_job (sharp_job *job) + { + double timer=wallTime(); + job->opcnt=0; + int lmax = job->ainfo->lmax, + mmax=sharp_get_mmax(job->ainfo->mval, job->ainfo->nm); + + job->norm_l = Ylmgen_get_norm (lmax, job->spin); + +/* clear output arrays if requested */ + init_output (job); + + int nchunks, chunksize; + get_chunk_info(job->ginfo->npairs,job->nv*VLEN,&nchunks,&chunksize); + alloc_phase (job,mmax+1,chunksize); + +/* chunk loop */ + for (int chunk=0; chunkginfo->npairs); + int *ispair = RALLOC(int,ulim-llim); + double *cth = RALLOC(double,ulim-llim), *sth = 
RALLOC(double,ulim-llim); + idxhelper *stmp = RALLOC(idxhelper,ulim-llim); + for (int i=0; iginfo->pair[i+llim].r2.nph>0; + cth[i] = job->ginfo->pair[i+llim].r1.cth; + sth[i] = job->ginfo->pair[i+llim].r1.sth; + stmp[i].s=sth[i]; + stmp[i].i=i; + } + qsort (stmp,ulim-llim,sizeof(idxhelper),idx_compare); + int *idx = RALLOC(int,ulim-llim); + for (int i=0; iphase where necessary */ + map2phase (job, mmax, llim, ulim); + +#pragma omp parallel +{ + sharp_job ljob = *job; + ljob.opcnt=0; + Ylmgen_C generator; + Ylmgen_init (&generator,lmax,mmax,ljob.spin); + alloc_almtmp(&ljob,lmax); + +#pragma omp for schedule(dynamic,1) + for (int mi=0; miainfo->nm; ++mi) + { +/* alm->alm_tmp where necessary */ + alm2almtmp (&ljob, lmax, mi); + + inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, idx); + +/* alm_tmp->alm where necessary */ + almtmp2alm (&ljob, lmax, mi); + } + + Ylmgen_destroy(&generator); + dealloc_almtmp(&ljob); + +#pragma omp critical + job->opcnt+=ljob.opcnt; +} /* end of parallel region */ + +/* phase->map where necessary */ + phase2map (job, mmax, llim, ulim); + + DEALLOC(ispair); + DEALLOC(cth); + DEALLOC(sth); + DEALLOC(idx); + } /* end of chunk loop */ + + DEALLOC(job->norm_l); + dealloc_phase (job); + job->time=wallTime()-timer; + } + +static void sharp_build_job_common (sharp_job *job, sharp_jobtype type, int spin, + int add_output, const sharp_geom_info *geom_info, + const sharp_alm_info *alm_info, int ntrans) + { + UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms"); + UTIL_ASSERT((spin>=0)&&(spin<=30), "bad spin"); + UTIL_ASSERT((type==MAP2ALM)||(type==ALM2MAP), "unsupported SHT type"); + job->type = type; + job->spin = spin; + job->norm_l = NULL; + job->add_output = add_output; + job->nmaps = (type==ALM2MAP_DERIV1) ? 2 : ((spin>0) ? 2 : 1); + job->nalm = (type==ALM2MAP_DERIV1) ? 1 : ((spin>0) ? 
2 : 1); + job->ginfo = geom_info; + job->ainfo = alm_info; + job->nv = sharp_nv_oracle (type, spin, ntrans); + job->time = 0.; + job->opcnt = 0; + job->ntrans = ntrans; + } + +void sharpd_build_job (sharp_job *job, sharp_jobtype type, int spin, + int add_output, dcmplx **alm, double **map, const sharp_geom_info *geom_info, + const sharp_alm_info *alm_info, int ntrans) + { + sharp_build_job_common (job, type, spin, add_output, geom_info, alm_info, + ntrans); + job->alm=(void **)alm; + job->map=(void **)map; + job->fde=DOUBLE; + } + +void sharps_build_job (sharp_job *job, sharp_jobtype type, int spin, + int add_output, fcmplx **alm, float **map, const sharp_geom_info *geom_info, + const sharp_alm_info *alm_info, int ntrans) + { + sharp_build_job_common (job, type, spin, add_output, geom_info, alm_info, + ntrans); + job->alm=(void **)alm; + job->map=(void **)map; + job->fde=FLOAT; + } + +int sharp_get_nv_max (void) +{ return 6; } + +int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans) + { + UTIL_ASSERT(type!=ALM2MAP_DERIV1,"transform type not yet supported"); + +#include "oracle.inc" + + return nv_opt[IMIN(ntrans,maxtr)-1][spin!=0][type]; + } + +#include "sharp_mpi.c" diff --git a/libsharp/sharp.h b/libsharp/sharp.h new file mode 100644 index 0000000..590da0b --- /dev/null +++ b/libsharp/sharp.h @@ -0,0 +1,213 @@ +/* + * This file is part of libsharp. + * + * libsharp is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libsharp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with libsharp; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/*! \file sharp.h + * Interface for the spherical transform library. + * + * Copyright (C) 2006-2012 Max-Planck-Society + * \author Martin Reinecke + */ + +#ifndef PLANCK_SHARP_H +#define PLANCK_SHARP_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*! \internal + Helper type containing information about a single ring. */ +typedef struct + { + double theta, phi0, weight, cth, sth; + ptrdiff_t ofs; + int nph, stride; + } sharp_ringinfo; + +/*! \internal + Helper type containing information about a pair of rings with colatitudes + symmetric around the equator. */ +typedef struct + { + sharp_ringinfo r1,r2; + } sharp_ringpair; + +/*! \internal + Type holding all required information about a map geometry. */ +typedef struct + { + sharp_ringpair *pair; + int npairs; + } sharp_geom_info; + +/*! \defgroup almgroup Helpers for dealing with a_lm */ +/*! \{ */ + +/*! \internal + Helper type for index calculation in a_lm arrays. */ +typedef struct + { + /*! Maximum \a l index of the array */ + int lmax; + /*! Number of different \a m values in this object */ + int nm; + /*! Array with \a nm entries containing the individual m values */ + int *mval; + /*! Array with \a nm entries containing the (hypothetical) indices of + the coefficients with quantum numbers 0,\a mval[i] */ + ptrdiff_t *mvstart; + /*! Stride between a_lm and a_(l+1),m */ + ptrdiff_t stride; + } sharp_alm_info; + +/*! 
Creates an Alm data structure information from the following parameters: + \param lmax maximum \a l quantum number (>=0) + \param mmax maximum \a m quantum number (0<= \a mmax <= \a lmax) + \param stride the stride between consecutive a_lm entries + \param mstart the index of the (hypothetical) coefficient with the + quantum numbers 0,\a m. Must have \a mmax+1 entries. + \param alm_info will hold a pointer to the newly created data structure + */ +void sharp_make_alm_info (int lmax, int mmax, int stride, + const ptrdiff_t *mstart, sharp_alm_info **alm_info); +/*! Creates an Alm data structure information from the following parameters: + \param lmax maximum \a l quantum number (>=0) + \param nm number of different \a m (<=\a lmax+1) + \param stride the stride between consecutive a_lm entries + \param mval array with \a nm entries containing the individual m values + \param mvstart array with \a nm entries containing the (hypothetical) + indices of the coefficients with the quantum numbers 0,\a mval[i] + \param alm_info will hold a pointer to the newly created data structure + */ +void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval, + const ptrdiff_t *mvstart, sharp_alm_info **alm_info); +/*! Returns the index of the coefficient with quantum numbers \a l, + \a mval[mi]. */ +ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi); +/*! Deallocates the a_lm info object. */ +void sharp_destroy_alm_info (sharp_alm_info *info); + +/*! \} */ + +/*! \defgroup geominfogroup Functions for dealing with geometry information */ +/*! \{ */ + +/*! Creates a geometry information from a set of ring descriptions. + All arrays passed to this function must have \a nrings elements. 
+ \param nrings the number of rings in the map + \param nph the number of pixels in each ring + \param ofs the index of the first pixel in each ring in the map array + \param stride the stride between consecutive pixels + \param phi0 the azimuth (in radians) of the first pixel in each ring + \param theta the colatitude (in radians) of each ring + \param weight the pixel weight to be used for the ring + \param geom_info will hold a pointer to the newly created data structure + */ +void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs, + const int *stride, const double *phi0, const double *theta, + const double *weight, sharp_geom_info **geom_info); + +/*! Deallocates the geometry information in \a info. */ +void sharp_destroy_geom_info (sharp_geom_info *info); + +/*! \} */ + +/*! \defgroup jobgroup Functionality for defining and executing SHTs */ +/*! \{ */ + +/*! Enumeration of SHARP job types. */ +typedef enum { MAP2ALM, /*!< analysis */ + ALM2MAP, /*!< synthesis */ + ALM2MAP_DERIV1 /*!< currently unused */ + } sharp_jobtype; + +typedef enum { FLOAT, DOUBLE } sharp_fde; + +/*! \internal + Type holding all required information about an SHT job. */ +typedef struct + { + sharp_jobtype type; + int spin; + int add_output; + int nmaps, nalm; + sharp_fde fde; + void **map; + void **alm; + complex double *phase; + double *norm_l; + complex double *almtmp; + const sharp_geom_info *ginfo; + const sharp_alm_info *ainfo; + int nv; + double time; + int ntrans; + unsigned long long opcnt; + } sharp_job; + +/*! Initializes \a job with the appropriate parameters to perform the required + SHT. + \param type the type of SHT (currently ALM2MAP and MAP2ALM) + \param spin the spin of the quantities to be transformed + \param add_output if 0, the output arrays will be overwritten, + else the result will be added to the output arrays. + \param ntrans the number of simultaneous SHTs + \param alm contains pointers to the a_lm coefficients. 
If \a spin==0, + alm[0] points to the a_lm of the first SHT, alm[1] to those of the second + etc. If \a spin>0, alm[0] and alm[1] point to the a_lm of the first SHT, + alm[2] and alm[3] to those of the second, etc. + \param map contains pointers to the maps. If \a spin==0, + map[0] points to the map of the first SHT, map[1] to that of the second + etc. If \a spin>0, map[0] and map[1] point to the maps of the first SHT, + map[2] and map[3] to those of the second, etc. + \note \a map and \a a_lm must not be de-allocated until after the last call of + sharp_execute_job()! This is because the library does not copy the input + data, but only stores the pointers to the supplied maps and a_lm. */ +void sharpd_build_job (sharp_job *job, sharp_jobtype type, int spin, + int add_output, complex double **alm, double **map, + const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans); + +void sharps_build_job (sharp_job *job, sharp_jobtype type, int spin, + int add_output, complex float **alm, float **map, + const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans); + +/*! Execute the SHT job previously constructed by sharpd_build_job() or + sharps_build_job(). */ +void sharp_execute_job (sharp_job *job); + +/*! \} */ + +/*! Internal */ +int sharp_get_nv_max (void); +/*! Internal */ +int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libsharp/sharp_acctest.c b/libsharp/sharp_acctest.c new file mode 100644 index 0000000..3f36877 --- /dev/null +++ b/libsharp/sharp_acctest.c @@ -0,0 +1,217 @@ +/* + * This file is part of libsharp. + * + * libsharp is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * libsharp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libsharp; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/*! \file sharp_acctest.c + Systematic accuracy test for libsharp. + + Copyright (C) 2006-2012 Max-Planck-Society + \author Martin Reinecke +*/ + +#include +#include +#ifdef USE_MPI +#include "mpi.h" +#endif +#include "sharp.h" +#include "sharp_geomhelpers.h" +#include "sharp_almhelpers.h" +#include "c_utils.h" +#include "sharp_core.h" + +typedef complex double dcmplx; + +static double drand (double min, double max) + { return min + (max-min)*rand()/(RAND_MAX+1.0); } + +static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin) + { + for (int mi=0;minm; ++mi) + { + int m=helper->mval[mi]; + for (int l=m;l<=helper->lmax; ++l) + { + if ((lmaxdiff) maxdiff=fabs(x); + if (fabs(y)>maxdiff) maxdiff=fabs(y); + } + sum=sqrt(sum/nalms); + sum2=sqrt(sum2/nalms); + UTIL_ASSERT((maxdiff<1e-10)&&(sum/sum2<1e-10),"error"); + } + } + +static void check_sign_scale(void) + { + int lmax=50; + int mmax=lmax; + sharp_geom_info *tinfo; + int nrings=lmax+1; + int ppring=2*lmax+2; + ptrdiff_t npix=(ptrdiff_t)nrings*ppring; + sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo); + + sharp_alm_info *alms; + sharp_make_triangular_alm_info(lmax,mmax,1,&alms); + ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax); + + double **map; + ALLOC2D(map,double,2,npix); + + dcmplx **alm; + ALLOC2D(alm,dcmplx,2,nalms); + for (int i=0; 
i<2; ++i) + for (int j=0; jlmax = lmax; + info->nm = mmax+1; + info->mval = RALLOC(int,mmax+1); + info->mvstart = RALLOC(ptrdiff_t,mmax+1); + info->stride = stride; + int tval = 2*lmax+1; + for (ptrdiff_t m=0; m<=mmax; ++m) + { + info->mval[m] = m; + info->mvstart[m] = stride*((m*(tval-m))>>1); + } + *alm_info = info; + } + +void sharp_make_rectangular_alm_info (int lmax, int mmax, int stride, + sharp_alm_info **alm_info) + { + sharp_alm_info *info = RALLOC(sharp_alm_info,1); + info->lmax = lmax; + info->nm = mmax+1; + info->mval = RALLOC(int,mmax+1); + info->mvstart = RALLOC(ptrdiff_t,mmax+1); + info->stride = stride; + for (ptrdiff_t m=0; m<=mmax; ++m) + { + info->mval[m] = m; + info->mvstart[m] = stride*m*(lmax+1); + } + *alm_info = info; + } diff --git a/libsharp/sharp_almhelpers.h b/libsharp/sharp_almhelpers.h new file mode 100644 index 0000000..c6cb35a --- /dev/null +++ b/libsharp/sharp_almhelpers.h @@ -0,0 +1,57 @@ +/* + * This file is part of libsharp. + * + * libsharp is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libsharp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libsharp; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/*! 
\file sharp_almhelpers.h + * SHARP helper function for the creation of a_lm data structures + * + * Copyright (C) 2008-2011 Max-Planck-Society + * \author Martin Reinecke + */ + +#ifndef PLANCK_SHARP_ALMHELPERS_H +#define PLANCK_SHARP_ALMHELPERS_H + +#include "sharp.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*! Initialises an a_lm data structure according to the scheme used by + Healpix_cxx. + \ingroup almgroup */ +void sharp_make_triangular_alm_info (int lmax, int mmax, int stride, + sharp_alm_info **alm_info); + +/*! Initialises an a_lm data structure according to the scheme used by + Fortran Healpix + \ingroup almgroup */ +void sharp_make_rectangular_alm_info (int lmax, int mmax, int stride, + sharp_alm_info **alm_info); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libsharp/sharp_bench.c b/libsharp/sharp_bench.c new file mode 100644 index 0000000..185dc9e --- /dev/null +++ b/libsharp/sharp_bench.c @@ -0,0 +1,143 @@ +/* + * This file is part of libsharp. + * + * libsharp is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libsharp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libsharp; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/*! 
\file sharp_bench.c + Copyright (C) 2012 Max-Planck-Society + \author Martin Reinecke +*/ + +#include +#include +#ifdef USE_MPI +#include "mpi.h" +#endif +#include "sharp.h" +#include "sharp_geomhelpers.h" +#include "sharp_almhelpers.h" +#include "c_utils.h" +#include "sharp_core.h" + +typedef complex double dcmplx; + +static void bench_sht (int spin, int nv, sharp_jobtype type, + int ntrans, double *time, unsigned long long *opcnt) + { + int lmax=2047; + int mmax=128; + int nrings=512; + int ppring=1024; + ptrdiff_t npix=(ptrdiff_t)nrings*ppring; + sharp_geom_info *tinfo; + sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo); + + ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax); + int ncomp = ntrans*((spin==0) ? 1 : 2); + + double **map; + ALLOC2D(map,double,ncomp,npix); + SET_ARRAY(map[0],0,npix*ncomp,0.); + + sharp_alm_info *alms; + sharp_make_triangular_alm_info(lmax,mmax,1,&alms); + + dcmplx **alm; + ALLOC2D(alm,dcmplx,ncomp,nalms); + SET_ARRAY(alm[0],0,nalms*ncomp,0.); + + int nruns=0; + sharp_job job; + sharpd_build_job(&job,type,spin,0,&alm[0],&map[0],tinfo,alms,ntrans); + job.nv=nv; + *time=1e30; + *opcnt=1000000000000000; + do + { + sharpd_build_job(&job,type,spin,0,&alm[0],&map[0],tinfo,alms,ntrans); + job.nv=nv; + sharp_execute_job(&job); + + if (job.opcnt<*opcnt) *opcnt=job.opcnt; + if (job.time<*time) *time=job.time; + } + while (++nruns < 4); + + DEALLOC2D(map); + DEALLOC2D(alm); + + sharp_destroy_alm_info(alms); + sharp_destroy_geom_info(tinfo); + } + +int main(void) + { +#ifdef USE_MPI + MPI_Init(NULL,NULL); +#endif + module_startup_c("sharp_bench",1,1,"",1); + + printf("Benchmarking SHTs.\n\n"); + FILE *fp=fopen("oracle.inc","w"); + UTIL_ASSERT(fp, "failed to open oracle file for writing"); + fprintf(fp,"static const int maxtr = 6;\n"); + fprintf(fp,"static const int nv_opt[6][2][3] = {\n"); + + for (int ntr=1; ntr<=6; ++ntr) + { + fprintf(fp,"{"); + for (int spin=0; spin<=2; spin+=2) + { + fprintf(fp,"{"); + for 
(sharp_jobtype type=MAP2ALM; type<=ALM2MAP; ++type) + { + int nvbest=-1, nvoracle=sharp_nv_oracle(type,spin,ntr); + unsigned long long opmin=1000000000000000, op; + double tmin=1e30; + double *time=RALLOC(double,sharp_get_nv_max()+1); + for (int nv=1; nv<=sharp_get_nv_max(); ++nv) + { + bench_sht (spin,nv,type,ntr,&time[nv],&op); + if (op +#include +#include +#include "vecsupport.h" +#include "complex_hacks.h" +#include "ylmgen_c.h" +#include "sharp.h" +#include "sharp_core.h" +#include "c_utils.h" + +typedef complex double dcmplx; + +#define MAXJOB_SPECIAL 2 + +#define XCONCAT2(a,b) a##_##b +#define CONCAT2(a,b) XCONCAT2(a,b) +#define XCONCAT3(a,b,c) a##_##b##_##c +#define CONCAT3(a,b,c) XCONCAT3(a,b,c) + +#define nvec 1 +#include "sharp_inchelper1.inc.c" +#undef nvec + +#define nvec 2 +#include "sharp_inchelper1.inc.c" +#undef nvec + +#define nvec 3 +#include "sharp_inchelper1.inc.c" +#undef nvec + +#define nvec 4 +#include "sharp_inchelper1.inc.c" +#undef nvec + +#define nvec 5 +#include "sharp_inchelper1.inc.c" +#undef nvec + +#define nvec 6 +#include "sharp_inchelper1.inc.c" +#undef nvec + +void inner_loop (sharp_job *job, const int *ispair,const double *cth, + const double *sth, int llim, int ulim, Ylmgen_C *gen, int mi, const int *idx) + { + int njobs=job->ntrans; + if (njobs<=MAXJOB_SPECIAL) + { + switch (njobs*16+job->nv) + { +#if (MAXJOB_SPECIAL>=1) + case 0x11: + CONCAT3(inner_loop,1,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x12: + CONCAT3(inner_loop,2,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x13: + CONCAT3(inner_loop,3,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x14: + CONCAT3(inner_loop,4,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x15: + CONCAT3(inner_loop,5,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x16: + CONCAT3(inner_loop,6,1) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; +#endif +#if (MAXJOB_SPECIAL>=2) + case 0x21: + 
CONCAT3(inner_loop,1,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x22: + CONCAT3(inner_loop,2,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x23: + CONCAT3(inner_loop,3,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x24: + CONCAT3(inner_loop,4,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x25: + CONCAT3(inner_loop,5,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x26: + CONCAT3(inner_loop,6,2) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; +#endif +#if (MAXJOB_SPECIAL>=3) + case 0x31: + CONCAT3(inner_loop,1,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x32: + CONCAT3(inner_loop,2,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x33: + CONCAT3(inner_loop,3,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x34: + CONCAT3(inner_loop,4,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x35: + CONCAT3(inner_loop,5,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x36: + CONCAT3(inner_loop,6,3) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; +#endif +#if (MAXJOB_SPECIAL>=4) + case 0x41: + CONCAT3(inner_loop,1,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x42: + CONCAT3(inner_loop,2,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x43: + CONCAT3(inner_loop,3,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x44: + CONCAT3(inner_loop,4,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x45: + CONCAT3(inner_loop,5,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x46: + CONCAT3(inner_loop,6,4) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; +#endif +#if (MAXJOB_SPECIAL>=5) + case 0x51: + CONCAT3(inner_loop,1,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x52: + CONCAT3(inner_loop,2,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x53: + CONCAT3(inner_loop,3,5) (job, 
ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x54: + CONCAT3(inner_loop,4,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x55: + CONCAT3(inner_loop,5,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x56: + CONCAT3(inner_loop,6,5) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; +#endif +#if (MAXJOB_SPECIAL>=6) + case 0x61: + CONCAT3(inner_loop,1,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x62: + CONCAT3(inner_loop,2,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x63: + CONCAT3(inner_loop,3,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x64: + CONCAT3(inner_loop,4,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x65: + CONCAT3(inner_loop,5,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; + case 0x66: + CONCAT3(inner_loop,6,6) (job, ispair,cth,sth,llim,ulim,gen,mi,idx); + return; +#endif + } + } +#if (MAXJOB_SPECIAL<6) + else + { + switch (job->nv) + { + case 1: + CONCAT2(inner_loop,1) + (job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans); + return; + case 2: + CONCAT2(inner_loop,2) + (job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans); + return; + case 3: + CONCAT2(inner_loop,3) + (job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans); + return; + case 4: + CONCAT2(inner_loop,4) + (job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans); + return; + case 5: + CONCAT2(inner_loop,5) + (job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans); + return; + case 6: + CONCAT2(inner_loop,6) + (job, ispair,cth,sth,llim,ulim,gen,mi,idx,job->ntrans); + return; + } + } +#endif + UTIL_FAIL("Incorrect vector parameters"); + } diff --git a/libsharp/sharp_core.h b/libsharp/sharp_core.h new file mode 100644 index 0000000..0699074 --- /dev/null +++ b/libsharp/sharp_core.h @@ -0,0 +1,49 @@ +/* + * This file is part of libsharp. 
+ * + * libsharp is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libsharp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libsharp; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/*! \file sharp_core.h + * Interface for the computational core + * + * Copyright (C) 2012 Max-Planck-Society + * \author Martin Reinecke + */ + +#ifndef PLANCK_SHARP_CORE_H +#define PLANCK_SHARP_CORE_H + +#include "sharp.h" +#include "ylmgen_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void inner_loop (sharp_job *job, const int *ispair,const double *cth, + const double *sth, int llim, int ulim, Ylmgen_C *gen, int mi, const int *idx); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libsharp/sharp_core_inc.c b/libsharp/sharp_core_inc.c new file mode 100644 index 0000000..b892f53 --- /dev/null +++ b/libsharp/sharp_core_inc.c @@ -0,0 +1,268 @@ +/* + * This file is part of libsharp. + * + * libsharp is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * libsharp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libsharp; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/*! \file sharp_core_inc.c + * Type-dependent code for the computational core + * + * Copyright (C) 2012 Max-Planck-Society + * \author Martin Reinecke + */ + +typedef struct + { Tv v[nvec]; } Tb; + +typedef union + { Tb b; double s[VLEN*nvec]; } Y(Tbu); + +typedef struct + { Tb r, i; } Y(Tbri); + +typedef struct + { Tb qr, qi, ur, ui; } Y(Tbqu); + +typedef struct + { double r[VLEN*nvec], i[VLEN*nvec]; } Y(Tsri); + +typedef struct + { double qr[VLEN*nvec],qi[VLEN*nvec],ur[VLEN*nvec],ui[VLEN*nvec]; } Y(Tsqu); + +typedef union + { Y(Tbri) b; Y(Tsri)s; } Y(Tburi); + +typedef union + { Y(Tbqu) b; Y(Tsqu)s; } Y(Tbuqu); + +static inline Tb Y(Tbconst)(double val) + { + Tv v=vload(val); + Tb res; + for (int i=0; iv[i],v); } + +static inline Tb Y(Tbprod)(Tb a, Tb b) + { Tb r; for (int i=0; iv[i],b.v[i]); } + +static inline void Y(mypow) (Tb val, int npow, Tb * restrict resd, + Tb * restrict ress) + { + Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.); + + do + { + if (npow&1) + { + for (int i=0; i>=1); + + *resd=res; + *ress=scale; + } + +static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2, + Tb * restrict scale) + { + int did_scale=0; + for (int i=0;iv[i]),vone); + if (vanyTrue(mask)) + { + did_scale=1; + Tv fact = vblend(mask,vload(fsmall),vone); + vmuleq(lam1->v[i],fact); vmuleq(lam2->v[i],fact); + 
vaddeq(scale->v[i],vblend(mask,vone,vzero)); + } + } + return did_scale; + } + +static inline void Y(normalize) (Tb * restrict val, Tb * restrict scale) + { + const Tv vfsmall=vload(fsmall), vfbig=vload(fbig); + for (int i=0;iv[i]),vone); + while (vanyTrue(mask)) + { + vmuleq(val->v[i],vblend(mask,vfsmall,vone)); + vaddeq(scale->v[i],vblend(mask,vone,vzero)); + mask = vgt(vabs(val->v[i]),vone); + } + mask = vlt(vabs(val->v[i]),vfsmall); + mask = vand(mask,vne(val->v[i],vzero)); + while (vanyTrue(mask)) + { + vmuleq(val->v[i],vblend(mask,vfbig,vone)); + vsubeq(scale->v[i],vblend(mask,vone,vzero)); + mask = vlt(vabs(val->v[i]),vfsmall); + mask = vand(mask,vne(val->v[i],vzero)); + } + } + } + +static inline int Y(TballLt)(Tb a,double b) + { + Tv vb=vload(b); + Tv res=vlt(a.v[0],vb); + for (int i=1; im; + Tb lam_1=Y(Tbconst)(0.), lam_2, scale; + Y(mypow) (sth,l,&lam_2,&scale); + Y(Tbmuleq1) (&lam_2,(gen->m&1) ? -gen->mfac[gen->m]:gen->mfac[gen->m]); + Y(normalize)(&lam_2,&scale); + + int below_limit = Y(TballLt)(scale,limscale); + while (below_limit) + { + if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} + Tv r0=vload(gen->rf[l].f[0]),r1=vload(gen->rf[l].f[1]); + for (int i=0; irf[l+1].f[0]); r1=vload(gen->rf[l+1].f[1]); + for (int i=0; iv[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,ryp->v[i])), + vmul(fx2,rxp->v[i])); + rxm->v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rym->v[i])), + vmul(fx2,rxm->v[i])); + } + } + +static void Y(iter_to_ieee_spin) (const Tb cth, int *l_, + Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_, + Tb * scalep_, Tb * scalem_, const Ylmgen_C * restrict gen) + { + const ylmgen_dbl3 * restrict fx = gen->fx; + Tb cth2, sth2; + for (int i=0; icosPow,&ccp,&ccps); Y(mypow)(sth2,gen->sinPow,&ssp,&ssps); + Y(mypow)(cth2,gen->sinPow,&csp,&csps); Y(mypow)(sth2,gen->cosPow,&scp,&scps); + + Tb rec2p, rec2m, scalep, scalem; + Tb rec1p=Y(Tbconst)(0.), rec1m=Y(Tbconst)(0.); + Tv prefac=vload(gen->prefac[gen->m]), + prescale=vload(gen->fscale[gen->m]); + for 
(int i=0; ipreMinus_p) + rec2p.v[i]=vneg(rec2p.v[i]); + if (gen->preMinus_m) + rec2m.v[i]=vneg(rec2m.v[i]); + if (gen->s&1) + rec2p.v[i]=vneg(rec2p.v[i]); + } + Y(normalize)(&rec2m,&scalem); Y(normalize)(&rec2p,&scalep); + + int l=gen->mhi; + + int below_limit = Y(TballLt)(scalep,limscale) && Y(TballLt)(scalem,limscale); + while (below_limit) + { + if (l+2>gen->lmax) {*l_=gen->lmax+1;return;} + Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l+1]); + Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l+2]); + if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) + below_limit = Y(TballLt)(scalep,limscale) && Y(TballLt)(scalem,limscale); + l+=2; + } + + *l_=l; + *rec1p_=rec1p; *rec2p_=rec2p; *scalep_=scalep; + *rec1m_=rec1m; *rec2m_=rec2m; *scalem_=scalem; + } diff --git a/libsharp/sharp_core_inc2.c b/libsharp/sharp_core_inc2.c new file mode 100644 index 0000000..e42e6de --- /dev/null +++ b/libsharp/sharp_core_inc2.c @@ -0,0 +1,702 @@ +/* + * This file is part of libsharp. + * + * libsharp is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libsharp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libsharp; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/*! 
\file sharp_core_inc2.c + * Type-dependent code for the computational core + * + * Copyright (C) 2012 Max-Planck-Society + * \author Martin Reinecke + */ + +typedef struct + { Y(Tbri) j[njobs]; } Z(Tbrij); +typedef union + { Z(Tbrij) b; Y(Tsri) j[njobs]; } Z(Tburij); +typedef struct + { Y(Tbqu) j[njobs]; } Z(Tbquj); +typedef union + { Z(Tbquj) b; Y(Tsqu) j[njobs]; } Z(Tbuquj); + +static void Z(alm2map_kernel) (const Tb cth, Z(Tbrij) * restrict p1, + Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2, + const ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm, + int l, int lmax) + { +#if (njobs>1) + while (lj[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4); + vfmaaeq(p1->j[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4); + } + Tv ar3=vload(creal(alm[njobs*(l+1)+j])), + ai3=vload(cimag(alm[njobs*(l+1)+j])), + ar1=vload(creal(alm[njobs*(l+3)+j])), + ai1=vload(cimag(alm[njobs*(l+3)+j])); + for (int i=0; ij[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1); + vfmaaeq(p2->j[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1); + } + } + r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]); + for (int i=0; ij[j].r.v[i],lam_2.v[i],ar); + vfmaeq(p1->j[j].i.v[i],lam_2.v[i],ai); + } + ar=vload(creal(alm[njobs*(l+1)+j])); + ai=vload(cimag(alm[njobs*(l+1)+j])); + for (int i=0; ij[j].r.v[i],lam_1.v[i],ar); + vfmaeq(p2->j[j].i.v[i],lam_1.v[i],ai); + } + } + r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]); + for (int i=0; ij[j].r.v[i],lam_2.v[i],ar); + vfmaeq(p1->j[j].i.v[i],lam_2.v[i],ai); + } + } + } + } + +static void Z(map2alm_kernel) (const Tb cth, const Z(Tbrij) * restrict p1, + const Z(Tbrij) * restrict p2, Tb lam_1, Tb lam_2, + const ylmgen_dbl2 * restrict rf, dcmplx * restrict alm, int l, int lmax) + { + while (lj[j].r.v[i]); + vfmaeq(ti1,lam_2.v[i],p1->j[j].i.v[i]); + } + for (int i=0; ij[j].r.v[i]); + vfmaeq(ti2,lam_1.v[i],p2->j[j].i.v[i]); + } + vhsum_cmplx2(tr1,ti1,tr2,ti2,&alm[l*njobs+j],&alm[(l+1)*njobs+j]); + } + r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]); + for (int i=0; ij[j].r.v[i]); + 
vfmaeq(tim,lam_2.v[i],p1->j[j].i.v[i]); + } + alm[l*njobs+j]+=vhsum_cmplx(tre,tim); + } + } + } + +static void Z(calc_alm2map) (const Tb cth, const Tb sth, const Ylmgen_C *gen, + sharp_job *job, Z(Tbrij) * restrict p1, Z(Tbrij) * restrict p2, int *done) + { + int l,lmax=gen->lmax; + Tb lam_1,lam_2,scale; + Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); + job->opcnt += (l-gen->m) * 4*VLEN*nvec; + if (l>lmax) { *done=1; return; } + job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec; + + Tb corfac; + Y(getCorfac)(scale,&corfac,gen->cf); + const ylmgen_dbl2 * restrict rf = gen->rf; + const dcmplx * restrict alm=job->almtmp; + int full_ieee = Y(TballGt)(scale,minscale); + while (!full_ieee) + { + for (int j=0; jj[j].r.v[i],tmp,ar); + vfmaeq(p1->j[j].i.v[i],tmp,ai); + } + } + if (++l>lmax) break; + Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]); + for (int i=0; ij[j].r.v[i],tmp,ar); + vfmaeq(p2->j[j].i.v[i],tmp,ai); + } + } + if (++l>lmax) break; + r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]); + for (int i=0; icf); + full_ieee = Y(TballGt)(scale,minscale); + } + } + if (l>lmax) { *done=1; return; } + + Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); + Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax); + } + +static void Z(calc_map2alm) (const Tb cth, const Tb sth, + const Ylmgen_C *gen, sharp_job *job, const Z(Tbrij) * restrict p1, + const Z(Tbrij) * restrict p2, int *done) + { + int lmax=gen->lmax; + Tb lam_1,lam_2,scale; + int l=gen->m; + Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); + job->opcnt += (l-gen->m) * 4*VLEN*nvec; + if (l>lmax) { *done=1; return; } + job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec; + + const ylmgen_dbl2 * restrict rf = gen->rf; + Tb corfac; + Y(getCorfac)(scale,&corfac,gen->cf); + dcmplx * restrict alm=job->almtmp; + int full_ieee = Y(TballGt)(scale,minscale); + while (!full_ieee) + { + for (int j=0; jj[j].r.v[i]); + vfmaeq(tim,tmp,p1->j[j].i.v[i]); + } + alm[l*njobs+j]+=vhsum_cmplx(tre,tim); + } + if 
(++l>lmax) { *done=1; return; } + Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]); + for (int i=0; ij[j].r.v[i]); + vfmaeq(tim,tmp,p2->j[j].i.v[i]); + } + alm[l*njobs+j]+=vhsum_cmplx(tre,tim); + } + if (++l>lmax) { *done=1; return; } + r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]); + for (int i=0; icf); + full_ieee = Y(TballGt)(scale,minscale); + } + } + + Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); + Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax); + } + +static inline void Z(saddstep) (Z(Tbquj) * restrict px, Z(Tbquj) * restrict py, + const Tb rxp, const Tb rxm, const dcmplx * restrict alm) + { + for (int j=0; jj[j].qr.v[i],agr,lw); + vfmaeq(px->j[j].qi.v[i],agi,lw); + vfmaeq(px->j[j].ur.v[i],acr,lw); + vfmaeq(px->j[j].ui.v[i],aci,lw); + } + for (int i=0; ij[j].qr.v[i],aci,lx); + vfmaeq(py->j[j].qi.v[i],acr,lx); + vfmaeq(py->j[j].ur.v[i],agi,lx); + vfmseq(py->j[j].ui.v[i],agr,lx); + } + } + } + +static inline void Z(saddstepb) (Z(Tbquj) * restrict p1, Z(Tbquj) * restrict p2, + const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m, + const dcmplx * restrict alm1, const dcmplx * restrict alm2) + { + for (int j=0; jj[j].qr.v[i],agr1,lw1,aci2,lx2); + vfmaaeq(p1->j[j].qi.v[i],agi1,lw1,acr2,lx2); + vfmaaeq(p1->j[j].ur.v[i],acr1,lw1,agi2,lx2); + vfmaseq(p1->j[j].ui.v[i],aci1,lw1,agr2,lx2); + } + for (int i=0; ij[j].qr.v[i],agr2,lw2,aci1,lx1); + vfmaaeq(p2->j[j].qi.v[i],agi2,lw2,acr1,lx1); + vfmaaeq(p2->j[j].ur.v[i],acr2,lw2,agi1,lx1); + vfmaseq(p2->j[j].ui.v[i],aci2,lw2,agr1,lx1); + } + } + } + +static inline void Z(saddstep2) (const Z(Tbquj) * restrict px, + const Z(Tbquj) * restrict py, const Tb * restrict rxp, + const Tb * restrict rxm, dcmplx * restrict alm) + { + for (int j=0; jv[i],rxm->v[i]); + vfmaeq(agr,px->j[j].qr.v[i],lw); + vfmaeq(agi,px->j[j].qi.v[i],lw); + vfmaeq(acr,px->j[j].ur.v[i],lw); + vfmaeq(aci,px->j[j].ui.v[i],lw); + } + for (int i=0; iv[i],rxp->v[i]); + vfmseq(agr,py->j[j].ui.v[i],lx); + 
vfmaeq(agi,py->j[j].ur.v[i],lx); + vfmaeq(acr,py->j[j].qi.v[i],lx); + vfmseq(aci,py->j[j].qr.v[i],lx); + } + vhsum_cmplx2(agr,agi,acr,aci,&alm[2*j],&alm[2*j+1]); + } + } + +static void Z(alm2map_spin_kernel) (Tb cth, Z(Tbquj) * restrict p1, + Z(Tbquj) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, + const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, int lmax) + { + while (l1) + Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l], + &alm[2*njobs*(l+1)]); +#else + Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l]); + Z(saddstep)(p2, p1, rec1p, rec1m, &alm[2*njobs*(l+1)]); +#endif + fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]); + fx2=vload(fx[l+2].f[2]); + for (int i=0; ilmax; + Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; + Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); + job->opcnt += (l-gen->m) * 10*VLEN*nvec; + if (l>lmax) + { *done=1; return; } + job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec; + + const ylmgen_dbl3 * restrict fx = gen->fx; + Tb corfacp,corfacm; + Y(getCorfac)(scalep,&corfacp,gen->cf); + Y(getCorfac)(scalem,&corfacm,gen->cf); + const dcmplx * restrict alm=job->almtmp; + int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); + while (!full_ieee) + { + Z(saddstep)(p1, p2, + Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), &alm[2*njobs*l]); + if (++l>lmax) break; + Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); + Z(saddstep)(p2, p1, + Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), &alm[2*njobs*l]); + if (++l>lmax) break; + Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); + if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) + { + Y(getCorfac)(scalep,&corfacp,gen->cf); + Y(getCorfac)(scalem,&corfacm,gen->cf); + full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); + } + } + + if (l>lmax) + { *done=1; return; } + + Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); + Y(Tbmuleq)(&rec1m,corfacm); 
Y(Tbmuleq)(&rec2m,corfacm); + Z(alm2map_spin_kernel) (cth,p1,p2, + rec1p, rec1m, rec2p, rec2m, fx, alm, l, lmax); + } + +static void Z(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen, + sharp_job *job, const Z(Tbquj) * restrict p1, const Z(Tbquj) * restrict p2, + int *done) + { + int l, lmax=gen->lmax; + Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; + Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); + job->opcnt += (l-gen->m) * 10*VLEN*nvec; + if (l>lmax) { *done=1; return; } + job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec; + + const ylmgen_dbl3 * restrict fx = gen->fx; + Tb corfacp,corfacm; + Y(getCorfac)(scalep,&corfacp,gen->cf); + Y(getCorfac)(scalem,&corfacm,gen->cf); + dcmplx * restrict alm=job->almtmp; + int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); + while (!full_ieee) + { + Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm); + Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l]); + if (++l>lmax) { *done=1; return; } + Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); + t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm); + Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l]); + if (++l>lmax) { *done=1; return; } + Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); + if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) + { + Y(getCorfac)(scalep,&corfacp,gen->cf); + Y(getCorfac)(scalem,&corfacm,gen->cf); + full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); + } + } + + Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); + Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); + Z(map2alm_spin_kernel) (cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax); + } + +#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) + +static void Z(inner_loop) (sharp_job *job, const int *ispair, + const double *cth_, const double *sth_, int llim, int ulim, Ylmgen_C *gen, + int mi, const int *idx) + { + const int nval=nvec*VLEN; + const int m 
= job->ainfo->mval[mi]; + Ylmgen_prepare (gen, m); + + switch (job->type) + { + case ALM2MAP: + { + if (job->spin==0) + { + int done=0; + for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; + itot=idx[itot]; + cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; + } + Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1.b,&p2.b,&done); + } + + for (int i=0; iainfo->nm+mi)); + complex double r1 = p1.j[j].r[i] + p1.j[j].i[i]*_Complex_I, + r2 = p2.j[j].r[i] + p2.j[j].i[i]*_Complex_I; + job->phase[phas_idx] = r1+r2; + if (ispair[itot]) + job->phase[phas_idx+1] = r1-r2; + } + } + } + } + } + else + { + int done=0; + for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; + itot=idx[itot]; + cth.s[i]=cth_[itot]; + } + Z(calc_alm2map_spin) (cth.b,gen,job,&p1.b,&p2.b,&done); + } + + for (int i=0; iainfo->nm+mi)); + complex double q1 = p1.j[j].qr[i] + p1.j[j].qi[i]*_Complex_I, + q2 = p2.j[j].qr[i] + p2.j[j].qi[i]*_Complex_I, + u1 = p1.j[j].ur[i] + p1.j[j].ui[i]*_Complex_I, + u2 = p2.j[j].ur[i] + p2.j[j].ui[i]*_Complex_I; + job->phase[phas_idx] = q1+q2; + job->phase[phas_idx+2] = u1+u2; + if (ispair[itot]) + { + dcmplx *phQ = &(job->phase[phas_idx+1]), + *phU = &(job->phase[phas_idx+3]); + *phQ = q1-q2; + *phU = u1-u2; + if ((gen->mhi-gen->m+gen->s)&1) + { *phQ=-(*phQ); *phU=-(*phU); } + } + } + } + } + } + } + break; + } + case ALM2MAP_DERIV1: + break; + case MAP2ALM: + { + if (job->spin==0) + { + int done=0; + for (int ith=0; (ith=ulim-llim) itot=ulim-llim-1; + itot=idx[itot]; + cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; + if (i+ithainfo->nm+mi)); + dcmplx ph1=job->phase[phas_idx]; + dcmplx ph2=ispair[itot] ? 
job->phase[phas_idx+1] : 0.; + p1.j[j].r[i]=creal(ph1+ph2); p1.j[j].i[i]=cimag(ph1+ph2); + p2.j[j].r[i]=creal(ph1-ph2); p2.j[j].i[i]=cimag(ph1-ph2); + } + } + } + Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1.b,&p2.b,&done); + } + } + else + { + int done=0; + for (int ith=0; (ith=ulim-llim) itot=ulim-llim-1; + itot=idx[itot]; + cth.s[i]=cth_[itot]; + if (i+ithainfo->nm+mi)); + dcmplx p1Q=job->phase[phas_idx], + p1U=job->phase[phas_idx+2], + p2Q=ispair[itot] ? job->phase[phas_idx+1]:0., + p2U=ispair[itot] ? job->phase[phas_idx+3]:0.; + if ((gen->mhi-gen->m+gen->s)&1) + { p2Q=-p2Q; p2U=-p2U; } + p1.j[j].qr[i]=creal(p1Q+p2Q); p1.j[j].qi[i]=cimag(p1Q+p2Q); + p1.j[j].ur[i]=creal(p1U+p2U); p1.j[j].ui[i]=cimag(p1U+p2U); + p2.j[j].qr[i]=creal(p1Q-p2Q); p2.j[j].qi[i]=cimag(p1Q-p2Q); + p2.j[j].ur[i]=creal(p1U-p2U); p2.j[j].ui[i]=cimag(p1U-p2U); + } + } + } + Z(calc_map2alm_spin) (cth.b,gen,job,&p1.b,&p2.b,&done); + } + } + break; + } + } + } + +#undef VZERO diff --git a/libsharp/sharp_core_inc3.c b/libsharp/sharp_core_inc3.c new file mode 100644 index 0000000..9fecec4 --- /dev/null +++ b/libsharp/sharp_core_inc3.c @@ -0,0 +1,691 @@ +/* + * This file is part of libsharp. + * + * libsharp is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libsharp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with libsharp; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/*! \file sharp_core_inc3.c + * Type-dependent code for the computational core + * + * Copyright (C) 2012 Max-Planck-Society + * \author Martin Reinecke + */ + +static void Y(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1, + Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2, + const ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm, + int l, int lmax, int njobs) + { + while (llmax; + Tb lam_1,lam_2,scale; + Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); + job->opcnt += (l-gen->m) * 4*VLEN*nvec; + if (l>lmax) { *done=1; return; } + job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec; + + Tb corfac; + Y(getCorfac)(scale,&corfac,gen->cf); + const ylmgen_dbl2 * restrict rf = gen->rf; + const dcmplx * restrict alm=job->almtmp; + int full_ieee = Y(TballGt)(scale,minscale); + while (!full_ieee) + { + for (int j=0; jlmax) break; + Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]); + for (int i=0; ilmax) break; + r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]); + for (int i=0; icf); + full_ieee = Y(TballGt)(scale,minscale); + } + } + if (l>lmax) { *done=1; return; } + + Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); + Y(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, njobs); + } + +static void Y(calc_map2alm) (const Tb cth, const Tb sth, + const Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1, + const Y(Tbri) * restrict p2, int njobs, int *done) + { + int lmax=gen->lmax; + Tb lam_1,lam_2,scale; + int l=gen->m; + Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen); + job->opcnt += (l-gen->m) * 4*VLEN*nvec; + if (l>lmax) { *done=1; return; } + 
job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec; + + const ylmgen_dbl2 * restrict rf = gen->rf; + Tb corfac; + Y(getCorfac)(scale,&corfac,gen->cf); + dcmplx * restrict alm=job->almtmp; + int full_ieee = Y(TballGt)(scale,minscale); + while (!full_ieee) + { + for (int j=0; jlmax) { *done=1; return; } + Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]); + for (int i=0; ilmax) { *done=1; return; } + r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]); + for (int i=0; icf); + full_ieee = Y(TballGt)(scale,minscale); + } + } + + Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac); + Y(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax, njobs); + } + +static inline void Y(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py, + const Tb rxp, const Tb rxm, const dcmplx * restrict alm, int njobs) + { + for (int j=0; jv[i],rxm->v[i]); + vfmaeq(agr,px[j].qr.v[i],lw); + vfmaeq(agi,px[j].qi.v[i],lw); + vfmaeq(acr,px[j].ur.v[i],lw); + vfmaeq(aci,px[j].ui.v[i],lw); + } + for (int i=0; iv[i],rxp->v[i]); + vfmseq(agr,py[j].ui.v[i],lx); + vfmaeq(agi,py[j].ur.v[i],lx); + vfmaeq(acr,py[j].qi.v[i],lx); + vfmseq(aci,py[j].qr.v[i],lx); + } + vhsum_cmplx2(agr,agi,acr,aci,&alm[2*j],&alm[2*j+1]); + } + } + +static void Y(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1, + Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m, + const ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l, + int lmax, int njobs) + { + while (llmax; + Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; + Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); + job->opcnt += (l-gen->m) * 10*VLEN*nvec; + if (l>lmax) + { *done=1; return; } + job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec; + + const ylmgen_dbl3 * restrict fx = gen->fx; + Tb corfacp,corfacm; + Y(getCorfac)(scalep,&corfacp,gen->cf); + Y(getCorfac)(scalem,&corfacm,gen->cf); + const dcmplx * restrict alm=job->almtmp; + int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); + while 
(!full_ieee) + { + Y(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm), + &alm[2*njobs*l],njobs); + if (++l>lmax) break; + Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); + Y(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm), + &alm[2*njobs*l], njobs); + if (++l>lmax) break; + Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); + if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) + { + Y(getCorfac)(scalep,&corfacp,gen->cf); + Y(getCorfac)(scalem,&corfacm,gen->cf); + full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); + } + } + + if (l>lmax) + { *done=1; return; } + + Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); + Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); + Y(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l, + lmax, njobs); + } + +static void Y(calc_map2alm_spin) (Tb cth, const Ylmgen_C * restrict gen, + sharp_job *job, const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2, + int njobs, int *done) + { + int l, lmax=gen->lmax; + Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep; + Y(iter_to_ieee_spin) (cth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen); + job->opcnt += (l-gen->m) * 10*VLEN*nvec; + if (l>lmax) { *done=1; return; } + job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec; + + const ylmgen_dbl3 * restrict fx = gen->fx; + Tb corfacp,corfacm; + Y(getCorfac)(scalep,&corfacp,gen->cf); + Y(getCorfac)(scalem,&corfacm,gen->cf); + dcmplx * restrict alm=job->almtmp; + int full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); + while (!full_ieee) + { + Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm); + Y(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l], njobs); + if (++l>lmax) { *done=1; return; } + Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]); + t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm); + Y(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l], njobs); + if (++l>lmax) { *done=1; return; } + 
Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]); + if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem)) + { + Y(getCorfac)(scalep,&corfacp,gen->cf); + Y(getCorfac)(scalem,&corfacm,gen->cf); + full_ieee = Y(TballGt)(scalep,minscale) && Y(TballGt)(scalem,minscale); + } + } + + Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp); + Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm); + Y(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax,njobs); + } + +#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0) + +static void Y(inner_loop) (sharp_job *job, const int *ispair, + const double *cth_, const double *sth_, int llim, int ulim, Ylmgen_C *gen, + int mi, const int *idx, int njobs) + { + const int nval=nvec*VLEN; + const int m = job->ainfo->mval[mi]; + Ylmgen_prepare (gen, m); + + switch (job->type) + { + case ALM2MAP: + { + if (job->spin==0) + { + int done=0; + for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; + itot=idx[itot]; + cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; + } + Y(calc_alm2map) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done); + } + + for (int i=0; iainfo->nm+mi)); + complex double r1 = p1[j].s.r[i] + p1[j].s.i[i]*_Complex_I, + r2 = p2[j].s.r[i] + p2[j].s.i[i]*_Complex_I; + job->phase[phas_idx] = r1+r2; + if (ispair[itot]) + job->phase[phas_idx+1] = r1-r2; + } + } + } + } + } + else + { + int done=0; + for (int ith=0; ith=ulim-llim) itot=ulim-llim-1; + itot=idx[itot]; + cth.s[i]=cth_[itot]; + } + Y(calc_alm2map_spin) (cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done); + } + + for (int i=0; iainfo->nm+mi)); + complex double q1 = p1[j].s.qr[i] + p1[j].s.qi[i]*_Complex_I, + q2 = p2[j].s.qr[i] + p2[j].s.qi[i]*_Complex_I, + u1 = p1[j].s.ur[i] + p1[j].s.ui[i]*_Complex_I, + u2 = p2[j].s.ur[i] + p2[j].s.ui[i]*_Complex_I; + job->phase[phas_idx] = q1+q2; + job->phase[phas_idx+2] = u1+u2; + if (ispair[itot]) + { + dcmplx *phQ = &(job->phase[phas_idx+1]), + *phU = &(job->phase[phas_idx+3]); + *phQ = q1-q2; + 
*phU = u1-u2; + if ((gen->mhi-gen->m+gen->s)&1) + { *phQ=-(*phQ); *phU=-(*phU); } + } + } + } + } + } + } + break; + } + case ALM2MAP_DERIV1: + break; + case MAP2ALM: + { + if (job->spin==0) + { + int done=0; + for (int ith=0; (ith=ulim-llim) itot=ulim-llim-1; + itot=idx[itot]; + cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot]; + if (i+ithainfo->nm+mi)); + dcmplx ph1=job->phase[phas_idx]; + dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.; + p1[j].s.r[i]=creal(ph1+ph2); p1[j].s.i[i]=cimag(ph1+ph2); + p2[j].s.r[i]=creal(ph1-ph2); p2[j].s.i[i]=cimag(ph1-ph2); + } + } + } + Y(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done); + } + } + else + { + int done=0; + for (int ith=0; (ith=ulim-llim) itot=ulim-llim-1; + itot=idx[itot]; + cth.s[i]=cth_[itot]; + if (i+ithainfo->nm+mi)); + dcmplx p1Q=job->phase[phas_idx], + p1U=job->phase[phas_idx+2], + p2Q=ispair[itot] ? job->phase[phas_idx+1]:0., + p2U=ispair[itot] ? job->phase[phas_idx+3]:0.; + if ((gen->mhi-gen->m+gen->s)&1) + { p2Q=-p2Q; p2U=-p2U; } + p1[j].s.qr[i]=creal(p1Q+p2Q); p1[j].s.qi[i]=cimag(p1Q+p2Q); + p1[j].s.ur[i]=creal(p1U+p2U); p1[j].s.ui[i]=cimag(p1U+p2U); + p2[j].s.qr[i]=creal(p1Q-p2Q); p2[j].s.qi[i]=cimag(p1Q-p2Q); + p2[j].s.ur[i]=creal(p1U-p2U); p2[j].s.ui[i]=cimag(p1U-p2U); + } + } + } + Y(calc_map2alm_spin) (cth.b,gen,job,&p1[0].b,&p2[0].b,njobs,&done); + } + } + break; + } + } + } + +#undef VZERO diff --git a/libsharp/sharp_geomhelpers.c b/libsharp/sharp_geomhelpers.c new file mode 100644 index 0000000..c095484 --- /dev/null +++ b/libsharp/sharp_geomhelpers.c @@ -0,0 +1,222 @@ +/* + * This file is part of libsharp. + * + * libsharp is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * libsharp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libsharp; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/*! \file sharp_geomhelpers.c + * Spherical transform library + * + * Copyright (C) 2006-2011 Max-Planck-Society + * \author Martin Reinecke + */ + +#include +#include "sharp_geomhelpers.h" +#include "c_utils.h" + +void sharp_make_healpix_geom_info (int nside, int stride, + sharp_geom_info **geom_info) + { + double *weight=RALLOC(double,2*nside); + SET_ARRAY(weight,0,2*nside,1); + sharp_make_weighted_healpix_geom_info (nside, stride, weight, geom_info); + DEALLOC(weight); + } + +void sharp_make_weighted_healpix_geom_info (int nside, int stride, + const double *weight, sharp_geom_info **geom_info) + { + const double pi=3.141592653589793238462643383279502884197; + ptrdiff_t npix=(ptrdiff_t)nside*nside*12; + ptrdiff_t ncap=2*(ptrdiff_t)nside*(nside-1); + int nrings=4*nside-1; + + double *theta=RALLOC(double,nrings); + double *weight_=RALLOC(double,nrings); + int *nph=RALLOC(int,nrings); + double *phi0=RALLOC(double,nrings); + ptrdiff_t *ofs=RALLOC(ptrdiff_t,nrings); + int *stride_=RALLOC(int,nrings); + for (int m=0; m2*nside) ? 
4*nside-ring : ring; + stride_[m] = stride; + if (northring < nside) + { + theta[m] = 2*asin(northring/(sqrt(6.)*nside)); + nph[m] = 4*northring; + phi0[m] = pi/nph[m]; + ofs[m] = 2*northring*(northring-1)*stride; + } + else + { + double fact1 = (8.*nside)/npix; + double costheta = (2*nside-northring)*fact1; + theta[m] = acos(costheta); + nph[m] = 4*nside; + if ((northring-nside) & 1) + phi0[m] = 0; + else + phi0[m] = pi/nph[m]; + ofs[m] = (ncap + (northring-nside)*nph[m])*stride; + } + if (northring != ring) /* southern hemisphere */ + { + theta[m] = pi-theta[m]; + ofs[m] = (npix - nph[m])*stride - ofs[m]; + } + weight_[m]=4.*pi/npix*weight[northring-1]; + } + + sharp_make_geom_info (nrings, nph, ofs, stride_, phi0, theta, weight_, + geom_info); + + DEALLOC(theta); + DEALLOC(weight_); + DEALLOC(nph); + DEALLOC(phi0); + DEALLOC(ofs); + DEALLOC(stride_); + } + +static void gauleg (double x1, double x2, double *x, double *w, int n) + { + const double pi = 3.141592653589793238462643383279502884197; + const double eps = 3.0E-14; + + int m = (n+1)/2; + double xm = 0.5*(x2+x1); + double xl = 0.5*(x2-x1); + for(int i=1; i<=m; ++i) + { + double z = cos(pi*(i-0.25)/(n+0.5)); + double pp; + int dobreak=0; + while(1) + { + double p1 = 1.0, p2 = 0.0; + double z1 = z; + int j; + for(j=1; j<=n; ++j) + { + double p3 = p2; + p2 = p1; + p1 = ((2*j-1)*z*p2-(j-1)*p3)/j; + } + pp = n*(z*p1-p2)/(z*z-1); + z = z1 - p1/pp; + if (dobreak) break; + if (fabs(z-z1) <= eps) dobreak=1; + } + x[i-1] = xm - xl*z; + x[n-i] = xm + xl*z; + w[i-1] = w[n-i] = 2*xl/((1-z*z)*pp*pp); + } + } + +static void makeweights (int bw, double *weights) + { + const double pi = 3.141592653589793238462643383279502884197; + const double fudge = pi/(4*bw); + for (int j=0; j<2*bw; ++j) + { + double tmpsum = 0; + for (int k=0; k=1) +#define njobs 1 +#define Z(arg) CONCAT3(arg,nvec,njobs) +#include "sharp_core_inc2.c" +#undef Z +#undef njobs +#endif + +#if (MAXJOB_SPECIAL>=2) +#define njobs 2 +#define Z(arg) 
CONCAT3(arg,nvec,njobs) +#include "sharp_core_inc2.c" +#undef Z +#undef njobs +#endif + +#if (MAXJOB_SPECIAL>=3) +#define njobs 3 +#define Z(arg) CONCAT3(arg,nvec,njobs) +#include "sharp_core_inc2.c" +#undef Z +#undef njobs +#endif + +#if (MAXJOB_SPECIAL>=4) +#define njobs 4 +#define Z(arg) CONCAT3(arg,nvec,njobs) +#include "sharp_core_inc2.c" +#undef Z +#undef njobs +#endif + +#if (MAXJOB_SPECIAL>=5) +#define njobs 5 +#define Z(arg) CONCAT3(arg,nvec,njobs) +#include "sharp_core_inc2.c" +#undef Z +#undef njobs +#endif + +#if (MAXJOB_SPECIAL>=6) +#define njobs 6 +#define Z(arg) CONCAT3(arg,nvec,njobs) +#include "sharp_core_inc2.c" +#undef Z +#undef njobs +#endif + +#undef Y +#undef Tb diff --git a/libsharp/sharp_mpi.c b/libsharp/sharp_mpi.c new file mode 100644 index 0000000..1827a06 --- /dev/null +++ b/libsharp/sharp_mpi.c @@ -0,0 +1,286 @@ +/* + * This file is part of libsharp. + * + * libsharp is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libsharp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libsharp; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/*! 
\file sharp_mpi.c + * Functionality only needed for MPI-parallel transforms + * + * Copyright (C) 2012 Max-Planck-Society + * \author Martin Reinecke + */ + +#ifdef USE_MPI + +#include "sharp_mpi.h" + +typedef struct + { + int ntasks; /* number of tasks */ + int mytask; /* own task number */ + MPI_Comm comm; /* communicator to use */ + + int *nm; /* number of m values on every task */ + int *ofs_m; /* accumulated nm */ + int nmtotal; /* total number of m values (must be mmax+1) */ + int *mval; /* array containing all m values of task 0, task 1 etc. */ + int mmax; + int nph; + + int *npair; /* number of ring pairs on every task */ + int *ofs_pair; /* accumulated npair */ + int npairtotal; /* total number of ring pairs */ + + double *theta; /* theta of first ring of every pair on task 0, task 1 etc. */ + int *ispair; /* is this really a pair? */ + + int *almcount, *almdisp, *mapcount, *mapdisp; /* for all2all communication */ + } sharp_mpi_info; + +static void sharp_make_mpi_info (MPI_Comm comm, const sharp_job *job, + sharp_mpi_info *minfo) + { + minfo->comm = comm; + MPI_Comm_size (comm, &minfo->ntasks); + MPI_Comm_rank (comm, &minfo->mytask); + + minfo->nm=RALLOC(int,minfo->ntasks); + MPI_Allgather ((int *)(&job->ainfo->nm),1,MPI_INT,minfo->nm,1,MPI_INT,comm); + minfo->ofs_m=RALLOC(int,minfo->ntasks+1); + minfo->ofs_m[0]=0; + for (int i=1; i<=minfo->ntasks; ++i) + minfo->ofs_m[i] = minfo->ofs_m[i-1]+minfo->nm[i-1]; + minfo->nmtotal=minfo->ofs_m[minfo->ntasks]; + minfo->mval=RALLOC(int,minfo->nmtotal); + MPI_Allgatherv(job->ainfo->mval, job->ainfo->nm, MPI_INT, minfo->mval, + minfo->nm, minfo->ofs_m, MPI_INT, comm); + + minfo->mmax=sharp_get_mmax(minfo->mval,minfo->nmtotal); + + minfo->npair=RALLOC(int,minfo->ntasks); + MPI_Allgather ((int *)(&job->ginfo->npairs), 1, MPI_INT, minfo->npair, 1, + MPI_INT, comm); + minfo->ofs_pair=RALLOC(int,minfo->ntasks+1); + minfo->ofs_pair[0]=0; + for (int i=1; i<=minfo->ntasks; ++i) + minfo->ofs_pair[i] = 
minfo->ofs_pair[i-1]+minfo->npair[i-1]; + minfo->npairtotal=minfo->ofs_pair[minfo->ntasks]; + + double *theta_tmp=RALLOC(double,job->ginfo->npairs); + int *ispair_tmp=RALLOC(int,job->ginfo->npairs); + for (int i=0; iginfo->npairs; ++i) + { + theta_tmp[i]=job->ginfo->pair[i].r1.theta; + ispair_tmp[i]=job->ginfo->pair[i].r2.nph>0; + } + minfo->theta=RALLOC(double,minfo->npairtotal); + minfo->ispair=RALLOC(int,minfo->npairtotal); + MPI_Allgatherv(theta_tmp, job->ginfo->npairs, MPI_DOUBLE, minfo->theta, + minfo->npair, minfo->ofs_pair, MPI_DOUBLE, comm); + MPI_Allgatherv(ispair_tmp, job->ginfo->npairs, MPI_INT, minfo->ispair, + minfo->npair, minfo->ofs_pair, MPI_INT, comm); + DEALLOC(theta_tmp); + DEALLOC(ispair_tmp); + + minfo->nph=2*job->nmaps*job->ntrans; + + minfo->almcount=RALLOC(int,minfo->ntasks); + minfo->almdisp=RALLOC(int,minfo->ntasks+1); + minfo->mapcount=RALLOC(int,minfo->ntasks); + minfo->mapdisp=RALLOC(int,minfo->ntasks+1); + minfo->almdisp[0]=minfo->mapdisp[0]=0; + for (int i=0; intasks; ++i) + { + minfo->almcount[i] = 2*minfo->nph*minfo->nm[minfo->mytask]*minfo->npair[i]; + minfo->almdisp[i+1] = minfo->almdisp[i]+minfo->almcount[i]; + minfo->mapcount[i] = 2*minfo->nph*minfo->nm[i]*minfo->npair[minfo->mytask]; + minfo->mapdisp[i+1] = minfo->mapdisp[i]+minfo->mapcount[i]; + } + } + +static void sharp_destroy_mpi_info (sharp_mpi_info *minfo) + { + DEALLOC(minfo->nm); + DEALLOC(minfo->ofs_m); + DEALLOC(minfo->mval); + DEALLOC(minfo->npair); + DEALLOC(minfo->ofs_pair); + DEALLOC(minfo->theta); + DEALLOC(minfo->ispair); + DEALLOC(minfo->almcount); + DEALLOC(minfo->almdisp); + DEALLOC(minfo->mapcount); + DEALLOC(minfo->mapdisp); + } + +static void sharp_communicate_alm2map (const sharp_mpi_info *minfo, dcmplx **ph) + { + dcmplx *phas_tmp = RALLOC(dcmplx,minfo->mapdisp[minfo->ntasks]/2); + + MPI_Alltoallv (*ph,minfo->almcount,minfo->almdisp,MPI_DOUBLE,phas_tmp, + minfo->mapcount,minfo->mapdisp,MPI_DOUBLE,minfo->comm); + + DEALLOC(*ph); + 
ALLOC(*ph,dcmplx,minfo->nph*minfo->npair[minfo->mytask]*minfo->nmtotal); + + for (int task=0; taskntasks; ++task) + for (int th=0; thnpair[minfo->mytask]; ++th) + for (int mi=0; minm[task]; ++mi) + { + int m = minfo->mval[mi+minfo->ofs_m[task]]; + int o1 = minfo->nph*(th*(minfo->mmax+1) + m); + int o2 = minfo->mapdisp[task]/2+minfo->nph*(mi+th*minfo->nm[task]); + for (int i=0; inph; ++i) + (*ph)[o1+i] = phas_tmp[o2+i]; + } + DEALLOC(phas_tmp); + } + +static void sharp_communicate_map2alm (const sharp_mpi_info *minfo, dcmplx **ph) + { + dcmplx *phas_tmp = RALLOC(dcmplx,minfo->mapdisp[minfo->ntasks]/2); + + for (int task=0; taskntasks; ++task) + for (int th=0; thnpair[minfo->mytask]; ++th) + for (int mi=0; minm[task]; ++mi) + { + int m = minfo->mval[mi+minfo->ofs_m[task]]; + int o1 = minfo->mapdisp[task]/2+minfo->nph*(mi+th*minfo->nm[task]); + int o2 = minfo->nph*(th*(minfo->mmax+1) + m); + for (int i=0; inph; ++i) + phas_tmp[o1+i] = (*ph)[o2+i]; + } + + DEALLOC(*ph); + ALLOC(*ph,dcmplx,minfo->nph*minfo->nm[minfo->mytask]*minfo->npairtotal); + + MPI_Alltoallv (phas_tmp,minfo->mapcount,minfo->mapdisp,MPI_DOUBLE, + *ph,minfo->almcount,minfo->almdisp,MPI_DOUBLE,minfo->comm); + + DEALLOC(phas_tmp); + } + +static void alloc_phase_mpi (sharp_job *job, int nm, int ntheta, + int nmfull, int nthetafull) + { + ptrdiff_t phase_size = (job->type==MAP2ALM) ? 
+ (ptrdiff_t)(nmfull)*ntheta : (ptrdiff_t)(nm)*nthetafull; + job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*phase_size); + } + +static void alm2map_comm (sharp_job *job, const sharp_mpi_info *minfo) + { + if (job->type != MAP2ALM) + sharp_communicate_alm2map (minfo,&job->phase); + } + +static void map2alm_comm (sharp_job *job, const sharp_mpi_info *minfo) + { + if (job->type == MAP2ALM) + sharp_communicate_map2alm (minfo,&job->phase); + } + +void sharp_execute_job_mpi (sharp_job *job, MPI_Comm comm) + { + double timer=wallTime(); + int ntasks; + MPI_Comm_size(comm, &ntasks); + if (ntasks==1) /* fall back to scalar implementation */ + { sharp_execute_job (job); return; } + + int lmax = job->ainfo->lmax; + + job->norm_l = Ylmgen_get_norm (lmax, job->spin); + + sharp_mpi_info minfo; + sharp_make_mpi_info(comm, job, &minfo); + +/* clear output arrays if requested */ + init_output (job); + + alloc_phase_mpi (job,job->ainfo->nm,job->ginfo->npairs,minfo.mmax+1, + minfo.npairtotal); + + double *cth = RALLOC(double,minfo.npairtotal), + *sth = RALLOC(double,minfo.npairtotal); + idxhelper *stmp = RALLOC(idxhelper,minfo.npairtotal); + for (int i=0; iphase where necessary */ + map2phase (job, minfo.mmax, 0, job->ginfo->npairs); + + map2alm_comm (job, &minfo); + +#pragma omp parallel +{ + sharp_job ljob = *job; + Ylmgen_C generator; + Ylmgen_init (&generator,lmax,minfo.mmax,ljob.spin); + alloc_almtmp(&ljob,lmax); + +#pragma omp for schedule(dynamic,1) + for (int mi=0; miainfo->nm; ++mi) + { +/* alm->alm_tmp where necessary */ + alm2almtmp (&ljob, lmax, mi); + +/* inner conversion loop */ + inner_loop (&ljob, minfo.ispair, cth, sth, 0, minfo.npairtotal, + &generator, mi, idx); + +/* alm_tmp->alm where necessary */ + almtmp2alm (&ljob, lmax, mi); + } + + Ylmgen_destroy(&generator); + dealloc_almtmp(&ljob); + +#pragma omp critical + job->opcnt+=ljob.opcnt; +} /* end of parallel region */ + + alm2map_comm (job, &minfo); + +/* phase->map where necessary */ + phase2map (job, 
minfo.mmax, 0, job->ginfo->npairs); + + DEALLOC(cth); + DEALLOC(sth); + DEALLOC(idx); + DEALLOC(job->norm_l); + dealloc_phase (job); + sharp_destroy_mpi_info(&minfo); + job->time=wallTime()-timer; + } + +#endif diff --git a/libsharp/sharp_mpi.h b/libsharp/sharp_mpi.h new file mode 100644 index 0000000..3bef24a --- /dev/null +++ b/libsharp/sharp_mpi.h @@ -0,0 +1,48 @@ +/* + * This file is part of libsharp. + * + * libsharp is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libsharp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libsharp; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/*! \file sharp_mpi.h + * Interface for the spherical transform library with MPI support. + * + * Copyright (C) 2011,2012 Max-Planck-Society + * \author Martin Reinecke + */ + +#ifndef PLANCK_SHARP_MPI_H +#define PLANCK_SHARP_MPI_H + +#include +#include "sharp.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void sharp_execute_job_mpi (sharp_job *job, MPI_Comm comm); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libsharp/sharp_test.c b/libsharp/sharp_test.c new file mode 100644 index 0000000..6bcd253 --- /dev/null +++ b/libsharp/sharp_test.c @@ -0,0 +1,243 @@ +/* + * This file is part of libsharp. 
+ * + * libsharp is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libsharp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libsharp; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/*! \file sharp_test.c + Accuracy test for libsharp's map analysis. + + This program first generates a_lm coefficients up to + a user-specified lmax (with mmax=lmax); where applicable, the + real and imaginary parts of the coefficients are uniform + random numbers of the interval [-1;1[. + Afterwards, the random a_lm are converted to a map. + This map is analyzed (optionally using an iterative scheme + with a user-supplied number of steps). + After every iteration, the code then outputs the RMS of the residual a_lm + (i.e. the difference between the current and original a_lm), divided by + the RMS of the original a_lm, as well as the maximum absolute change of any + real or imaginary part between the current and original a_lm. + + This operation can be performed for several different pixelisations: + - a Gaussian with the minimal number of rings for exact analysis + and a user-defined ring resolution + - an ECP grid with the minimal number of rings for exact analysis + and a user-defined ring resolution + - a Healpix grid with a user-defined Nside parameter. 
+ + The user can specify the spin of the desired transform. + + Copyright (C) 2006-2012 Max-Planck-Society + \author Martin Reinecke +*/ + +#include +#include +#ifdef USE_MPI +#include "mpi.h" +#endif +#include "sharp.h" +#include "sharp_geomhelpers.h" +#include "sharp_almhelpers.h" +#include "c_utils.h" +#include "sharp_core.h" + +typedef complex double dcmplx; + +static double drand (double min, double max) + { return min + (max-min)*rand()/(RAND_MAX+1.0); } + +static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin) + { + for (int mi=0;minm; ++mi) + { + int m=helper->mval[mi]; + for (int l=m;l<=helper->lmax; ++l) + { + if ((lmaxdiff) maxdiff=fabs(x); + if (fabs(y)>maxdiff) maxdiff=fabs(y); + } + sum=sqrt(sum/nalms); + sum2=sqrt(sum2/nalms); + printf("component %i: rms %e, maxerr %e\n",i, sum/sum2, maxdiff); + } + } + +static void map2alm_iter (sharp_geom_info *tinfo, double **map, + dcmplx **alm_orig, dcmplx **alm, int lmax, int mmax, + ptrdiff_t npix, ptrdiff_t nalms, int spin, int ntrans, int niter) + { + int ncomp = ntrans*((spin==0) ? 
1 : 2); + + sharp_alm_info *alms; + sharp_make_triangular_alm_info(lmax,mmax,1,&alms); + + sharp_job job; + sharpd_build_job(&job,MAP2ALM,spin,0,&alm[0],&map[0],tinfo,alms,ntrans); + sharp_execute_job(&job); + printf("wall time for map2alm: %fs\n",job.time); + printf("Performance: %fGFLOPs/s\n",1e-9*job.opcnt/job.time); + measure_errors(alm_orig,alm,nalms,ncomp); + + for (int iter=0; iter ",1); + + int lmax=atoi(argv[2]); + int niter=atoi(argv[4]); + int spin=atoi(argv[5]); + int ntrans=atoi(argv[6]); + + printf("Testing map analysis accuracy.\n"); + printf("lmax=%d, %d iterations, spin=%d\n", lmax, niter, spin); + + sharp_geom_info *tinfo; + if (strcmp(argv[1],"gauss")==0) + { + int nrings=lmax+1; + int ppring=atoi(argv[3]); + ptrdiff_t npix=(ptrdiff_t)nrings*ppring; + printf("\nTesting Gaussian grid (%d rings, %d pixels/ring, %ld pixels)\n", + nrings,ppring,(long)npix); + sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo); + check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter); + sharp_destroy_geom_info(tinfo); + } + else if (strcmp(argv[1],"ecp")==0) + { + int nrings=2*lmax+2; + int ppring=atoi(argv[3]); + ptrdiff_t npix=(ptrdiff_t)nrings*ppring; + printf("\nTesting ECP grid (%d rings, %d pixels/ring, %ld pixels)\n", + nrings,ppring,(long)npix); + sharp_make_ecp_geom_info (nrings, ppring, 0., 1, ppring, &tinfo); + check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter); + sharp_destroy_geom_info(tinfo); + } + else if (strcmp(argv[1],"healpix")==0) + { + int nside=atoi(argv[3]); + if (nside<1) nside=1; + ptrdiff_t npix=12*(ptrdiff_t)nside*nside; + printf("\nTesting Healpix grid (nside=%d, %ld pixels)\n", + nside,(long)npix); + sharp_make_healpix_geom_info (nside, 1, &tinfo); + check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter); + sharp_destroy_geom_info(tinfo); + } + else + UTIL_FAIL("unknown grid geometry"); + +#ifdef USE_MPI + MPI_Finalize(); +#endif + return 0; + } diff --git a/libsharp/sharp_test_mpi.c b/libsharp/sharp_test_mpi.c new file 
mode 100644 index 0000000..e8bd79b --- /dev/null +++ b/libsharp/sharp_test_mpi.c @@ -0,0 +1,354 @@ +/* + * This file is part of libsharp. + * + * libsharp is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libsharp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libsharp; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/*! \file sharp_test_mpi.c + Accuracy test for libsharp's map analysis with MPI support. + + This program first generates a_lm coefficients up to + a user-specified lmax (with mmax=lmax); where applicable, the + real and imaginary parts of the coefficients are uniform + random numbers of the interval [-1;1[. + Afterwards, the random a_lm are converted to a map. + This map is analyzed (optionally using an iterative scheme + with a user-supplied number of steps). + After every iteration, the code then outputs the RMS of the residual a_lm + (i.e. the difference between the current and original a_lm), divided by + the RMS of the original a_lm, as well as the maximum absolute change of any + real or imaginary part between the current and original a_lm. 
+ + This operation can be performed for several different pixelisations: + - a Gaussian with the minimal number of rings for exact analysis + and a user-defined ring resolution + - an ECP grid with the minimal number of rings for exact analysis + and a user-defined ring resolution + - a Healpix grid with a user-defined Nside parameter. + + The user can specify the spin of the desired transform. + + Copyright (C) 2006-2012 Max-Planck-Society + \author Martin Reinecke +*/ + +#ifdef USE_MPI + +#include +#include +#include "sharp_mpi.h" +#include "sharp_geomhelpers.h" +#include "sharp_almhelpers.h" +#include "c_utils.h" +#include "walltime_c.h" +#include "sharp_core.h" + +typedef complex double dcmplx; + +int ntasks, mytask; + +static unsigned long long totalops (unsigned long long val) + { + unsigned long long tmp; + MPI_Allreduce (&val, &tmp,1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + return tmp; + } + +static double maxTime (double val) + { + double tmp; + MPI_Allreduce (&val, &tmp,1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + return tmp; + } + +static double drand (double min, double max) + { return min + (max-min)*rand()/(RAND_MAX+1.0); } + +static ptrdiff_t get_nalms(const sharp_alm_info *ainfo) + { + ptrdiff_t res=0; + for (int i=0; inm; ++i) + res += ainfo->lmax-ainfo->mval[i]+1; + return res; + } + +static ptrdiff_t get_npix(const sharp_geom_info *ginfo) + { + ptrdiff_t res=0; + for (int i=0; inpairs; ++i) + { + res += ginfo->pair[i].r1.nph; + if (ginfo->pair[i].r2.nph>0) res += ginfo->pair[i].r2.nph; + } + return res; + } + +static void reduce_alm_info(sharp_alm_info *ainfo) + { + int nmnew=0; + ptrdiff_t ofs = 0; + for (int i=mytask; inm; i+=ntasks,++nmnew) + { + ainfo->mval[nmnew]=ainfo->mval[i]; + ainfo->mvstart[nmnew]=ofs-ainfo->mval[nmnew]; + ofs+=ainfo->lmax-ainfo->mval[nmnew]+1; + } + ainfo->nm=nmnew; + } + +static void reduce_geom_info(sharp_geom_info *ginfo) + { + int npairsnew=0; + ptrdiff_t ofs = 0; + for (int i=mytask; inpairs; 
i+=ntasks,++npairsnew) + { + ginfo->pair[npairsnew]=ginfo->pair[i]; + ginfo->pair[npairsnew].r1.ofs=ofs; + ofs+=ginfo->pair[npairsnew].r1.nph; + ginfo->pair[npairsnew].r2.ofs=ofs; + if (ginfo->pair[npairsnew].r2.nph>0) ofs+=ginfo->pair[npairsnew].r2.nph; + } + ginfo->npairs=npairsnew; + } + +static void random_alm (dcmplx *alm, sharp_alm_info *helper, int spin) + { + static int cnt=0; + ++cnt; + for (int mi=0;minm; ++mi) + { + int m=helper->mval[mi]; + srand(1234567*cnt+8912*m); + for (int l=m;l<=helper->lmax; ++l) + { + if ((lnm; ++mi) + { + int m=ainfo->mval[mi]; + for (int l=m; l<=ainfo->lmax; ++l) + { + ptrdiff_t idx=sharp_alm_index(ainfo,l,mi); + double x=creal(alm[i][idx])-creal(alm2[i][idx]), + y=cimag(alm[i][idx])-cimag(alm2[i][idx]); + sum+=x*x+y*y; + sum2+=creal(alm[i][idx])*creal(alm[i][idx]) + +cimag(alm[i][idx])*cimag(alm[i][idx]); + if (fabs(x)>maxdiff) maxdiff=fabs(x); + if (fabs(y)>maxdiff) maxdiff=fabs(y); + } + } + + MPI_Allreduce(&sum,&sumtot,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); + MPI_Allreduce(&sum2,&sum2tot,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); + MPI_Allreduce(&maxdiff,&maxdifftot,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD); + sumtot=sqrt(sumtot/nalms_tot); + sum2tot=sqrt(sum2tot/nalms_tot); + if (mytask==0) + printf("component %i: rms %e, maxerr %e\n",i, sumtot/sum2tot, maxdifftot); + } + } + +static void map2alm_iter (sharp_geom_info *tinfo, double **map, + dcmplx **alm_orig, dcmplx **alm, int lmax, int mmax, + ptrdiff_t npix, int spin, int ntrans, int niter) + { + int ncomp = ntrans*((spin==0) ? 
1 : 2); + + sharp_alm_info *alms; + sharp_make_triangular_alm_info(lmax,mmax,1,&alms); + reduce_alm_info(alms); + + sharp_job job; + sharpd_build_job(&job,MAP2ALM,spin,0,&alm[0],&map[0],tinfo,alms,ntrans); + sharp_execute_job_mpi(&job,MPI_COMM_WORLD); + unsigned long long opcnt=totalops(job.opcnt); + double timer=maxTime(job.time); + if (mytask==0) printf("wall time for map2alm: %fs\n",timer); + if (mytask==0) printf("Performance: %fGFLOPs/s\n",1e-9*opcnt/timer); + measure_errors(alm_orig,alm,alms,ncomp); + + for (int iter=0; iter ", + mytask==0); + int lmax=atoi(argv[2]); + int niter=atoi(argv[4]); + int spin=atoi(argv[5]); + int ntrans=atoi(argv[6]); + + if (mytask==0) + { + printf("Testing map analysis accuracy.\n"); + printf("lmax=%d, %d iterations, spin=%d\n", lmax, niter, spin); + } + + sharp_geom_info *tinfo; + if (strcmp(argv[1],"gauss")==0) + { + int nrings=lmax+1; + int ppring=atoi(argv[3]); + ptrdiff_t npix=(ptrdiff_t)nrings*ppring; + if (mytask==0) + printf("\nTesting Gaussian grid (%d rings, %d pixels/ring, %ld pixels)\n", + nrings,ppring,(long)npix); + sharp_make_gauss_geom_info (nrings, ppring, 1, ppring, &tinfo); + reduce_geom_info(tinfo); + npix=get_npix(tinfo); + check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter); + sharp_destroy_geom_info(tinfo); + } + else if (strcmp(argv[1],"ecp")==0) + { + int nrings=2*lmax+2; + int ppring=atoi(argv[3]); + ptrdiff_t npix=(ptrdiff_t)nrings*ppring; + if (mytask==0) + printf("\nTesting ECP grid (%d rings, %d pixels/ring, %ld pixels)\n", + nrings,ppring,(long)npix); + sharp_make_ecp_geom_info (nrings, ppring, 0., 1, ppring, &tinfo); + reduce_geom_info(tinfo); + npix=get_npix(tinfo); + check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter); + sharp_destroy_geom_info(tinfo); + } + else if (strcmp(argv[1],"healpix")==0) + { + int nside=atoi(argv[3]); + if (nside<1) nside=1; + ptrdiff_t npix=12*(ptrdiff_t)nside*nside; + if (mytask==0) + printf("\nTesting Healpix grid (nside=%d, %ld pixels)\n", + nside,(long)npix); 
+ sharp_make_healpix_geom_info (nside, 1, &tinfo); + reduce_geom_info(tinfo); + npix=get_npix(tinfo); + check_accuracy(tinfo,lmax,lmax,npix,spin,ntrans,niter); + sharp_destroy_geom_info(tinfo); + } + else + UTIL_FAIL("unknown grid geometry"); + + MPI_Finalize(); + return 0; + } + +#else + +#include "c_utils.h" + +int main(void) + { UTIL_FAIL("MPI support not enabled."); return 1; } + +#endif diff --git a/libsharp/vecsupport.h b/libsharp/vecsupport.h new file mode 100644 index 0000000..ccb9364 --- /dev/null +++ b/libsharp/vecsupport.h @@ -0,0 +1,158 @@ +/* + * This file is part of libsharp. + * + * libsharp is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libsharp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libsharp; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). 
+ */
+
+/* \file vecsupport.h
+ * Convenience functions for vector arithmetics
+ *
+ * Copyright (C) 2012 Max-Planck-Society
+ * Author: Martin Reinecke
+ */
+
+#ifndef VECSUPPORT_H
+#define VECSUPPORT_H
+
+#include <math.h>
+#include "vec_utils.h"
+
+typedef double Ts;
+
+#if (VLEN==1)
+
+typedef double Tv; /* scalar fallback: a "vector" is a single double */
+
+#define vadd(a,b) ((a)+(b))
+#define vaddeq(a,b) ((a)+=(b))
+#define vsub(a,b) ((a)-(b))
+#define vsubeq(a,b) ((a)-=(b))
+#define vmul(a,b) ((a)*(b))
+#define vmuleq(a,b) ((a)*=(b))
+#define vfmaeq(a,b,c) ((a)+=(b)*(c))
+#define vfmseq(a,b,c) ((a)-=(b)*(c))
+#define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e))
+#define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e))
+#define vneg(a) (-(a))
+#define vload(a) (a)
+#define vabs(a) fabs(a)
+#define vsqrt(a) sqrt(a)
+#define vlt(a,b) (((a)<(b))?1.:0.) /* comparisons yield 1. for true, 0. for false */
+#define vgt(a,b) (((a)>(b))?1.:0.)
+#define vne(a,b) (((a)!=(b))?1.:0.)
+#define vand(a,b) ((((a)*(b))!=0.)?1.:0.) /* true only if the product of both masks is nonzero */
+
+static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; } /* smaller of a and b */
+static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; } /* larger of a and b */
+
+#define vanyTrue(a) ((a)!=0.) /* with a single lane, "any" and "all" coincide */
+#define vallTrue(a) ((a)!=0.)
+#define vblend(m,a,b) (((m)!=0.) ? (a) : (b)) /* a where mask m is nonzero, else b */
+#define vzero 0.
+#define vone 1.
+
+#endif
+
+#if (VLEN==2)
+
+#include <emmintrin.h>
+
+#if defined (__SSE3__)
+#include <pmmintrin.h>
+#endif
+#if defined (__SSE4_1__)
+#include <smmintrin.h>
+#endif
+
+typedef __m128d Tv; /* SSE2 vector holding two doubles */
+
+#define vadd(a,b) _mm_add_pd(a,b)
+#define vaddeq(a,b) a=_mm_add_pd(a,b)
+#define vsub(a,b) _mm_sub_pd(a,b)
+#define vsubeq(a,b) a=_mm_sub_pd(a,b)
+#define vmul(a,b) _mm_mul_pd(a,b)
+#define vmuleq(a,b) a=_mm_mul_pd(a,b)
+#define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c))
+#define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c))
+#define vfmaaeq(a,b,c,d,e) \
+  a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
+#define vfmaseq(a,b,c,d,e) \
+  a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
+#define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a) /* XOR with -0. flips the sign bit */
+#define vload(a) _mm_set1_pd(a) /* broadcasts scalar a to both lanes */
+#define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a) /* clears the sign bit */
+#define vsqrt(a) _mm_sqrt_pd(a)
+#define vlt(a,b) _mm_cmplt_pd(a,b)
+#define vgt(a,b) _mm_cmpgt_pd(a,b)
+#define vne(a,b) _mm_cmpneq_pd(a,b)
+#define vand(a,b) _mm_and_pd(a,b)
+#define vmin(a,b) _mm_min_pd(a,b)
+#define vmax(a,b) _mm_max_pd(a,b) /* no trailing ';' so vmax() can be used inside expressions */
+#define vanyTrue(a) (_mm_movemask_pd(a)!=0) /* sign bit set in at least one lane */
+#define vallTrue(a) (_mm_movemask_pd(a)==3) /* sign bit set in both lanes */
+#if defined(__SSE4_1__)
+#define vblend(m,a,b) _mm_blendv_pd(b,a,m)
+#else
+static inline Tv vblend(Tv m, Tv a, Tv b) /* bitwise select: a where m is set, b elsewhere */
+  { return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); }
+#endif
+#define vzero _mm_setzero_pd()
+#define vone _mm_set1_pd(1.)
+ +#endif + +#if (VLEN==4) + +#include + +typedef __m256d Tv; + +#define vadd(a,b) _mm256_add_pd(a,b) +#define vaddeq(a,b) a=_mm256_add_pd(a,b) +#define vsub(a,b) _mm256_sub_pd(a,b) +#define vsubeq(a,b) a=_mm256_sub_pd(a,b) +#define vmul(a,b) _mm256_mul_pd(a,b) +#define vmuleq(a,b) a=_mm256_mul_pd(a,b) +#define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c)) +#define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c)) +#define vfmaaeq(a,b,c,d,e) \ + a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e))) +#define vfmaseq(a,b,c,d,e) \ + a=_mm256_add_pd(a,_mm256_sub_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e))) +#define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a) +#define vload(a) _mm256_set1_pd(a) +#define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a) +#define vsqrt(a) _mm256_sqrt_pd(a) +#define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ) +#define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ) +#define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ) +#define vand(a,b) _mm256_and_pd(a,b) +#define vmin(a,b) _mm256_min_pd(a,b) +#define vmax(a,b) _mm256_max_pd(a,b) +#define vanyTrue(a) (_mm256_movemask_pd(a)!=0) +#define vallTrue(a) (_mm256_movemask_pd(a)==15) +#define vblend(m,a,b) _mm256_blendv_pd(b,a,m) +#define vzero _mm256_setzero_pd() +#define vone _mm256_set1_pd(1.) + +#endif + +#endif diff --git a/libsharp/ylmgen_c.c b/libsharp/ylmgen_c.c new file mode 100644 index 0000000..e674e63 --- /dev/null +++ b/libsharp/ylmgen_c.c @@ -0,0 +1,206 @@ +/* + * This file is part of libsharp. + * + * libsharp is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libsharp is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libsharp; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/* + * Helper code for efficient calculation of Y_lm(theta,phi=0) + * + * Copyright (C) 2005-2012 Max-Planck-Society + * Author: Martin Reinecke + */ + +#include +#include +#include "ylmgen_c.h" +#include "c_utils.h" + +void Ylmgen_init (Ylmgen_C *gen, int l_max, int m_max, int spin) + { + const double inv_sqrt4pi = 0.2820947917738781434740397257803862929220; + + gen->lmax = l_max; + gen->mmax = m_max; + UTIL_ASSERT(spin>=0,"incorrect spin"); + gen->s = spin; + UTIL_ASSERT((minscale<=0)&&(maxscale>0),"bad value for min/maxscale"); + gen->cf=RALLOC(double,maxscale-minscale+1); + gen->cf[-minscale]=1.; + for (int m=-minscale-1; m>=0; --m) + gen->cf[m]=gen->cf[m+1]*fsmall; + for (int m=-minscale+1; m<(maxscale-minscale+1); ++m) + gen->cf[m]=gen->cf[m-1]*fbig; + + gen->m = -1; + if (spin==0) + { + gen->rf = RALLOC(ylmgen_dbl2,gen->lmax+1); + gen->mfac = RALLOC(double,gen->mmax+1); + gen->mfac[0] = inv_sqrt4pi; + for (int m=1; m<=gen->mmax; ++m) + gen->mfac[m] = gen->mfac[m-1]*sqrt((2*m+1.)/(2*m)); + gen->root = RALLOC(double,2*gen->lmax+5); + gen->iroot = RALLOC(double,2*gen->lmax+5); + for (int m=0; m<2*gen->lmax+5; ++m) + { + gen->root[m] = sqrt(m); + gen->iroot[m] = (m==0) ? 0. 
: 1./gen->root[m]; + } + } + else + { + gen->m=gen->mlo=gen->mhi=-1234567890; + ALLOC(gen->fx,ylmgen_dbl3,gen->lmax+2); + for (int m=0; mlmax+2; ++m) + gen->fx[m].f[0]=gen->fx[m].f[1]=gen->fx[m].f[2]=0.; + ALLOC(gen->inv,double,gen->lmax+1); + gen->inv[0]=0; + for (int m=1; mlmax+1; ++m) gen->inv[m]=1./m; + ALLOC(gen->flm1,double,2*gen->lmax+1); + ALLOC(gen->flm2,double,2*gen->lmax+1); + for (int m=0; m<2*gen->lmax+1; ++m) + { + gen->flm1[m] = sqrt(1./(m+1.)); + gen->flm2[m] = sqrt(m/(m+1.)); + } + ALLOC(gen->prefac,double,gen->mmax+1); + ALLOC(gen->fscale,int,gen->mmax+1); + double *fac = RALLOC(double,2*gen->lmax+1); + int *facscale = RALLOC(int,2*gen->lmax+1); + fac[0]=1; facscale[0]=0; + for (int m=1; m<2*gen->lmax+1; ++m) + { + fac[m]=fac[m-1]*sqrt(m); + facscale[m]=facscale[m-1]; + if (fac[m]>1.) { fac[m]*=fsmall; ++facscale[m]; } + } + for (int m=0; m<=gen->mmax; ++m) + { + int mlo=gen->s, mhi=m; + if (mhiprefac[m]=fac[2*mhi]/(fac[mhi+mlo]*fac[mhi-mlo]); + gen->fscale[m]=facscale[2*mhi]-facscale[mhi+mlo]-facscale[mhi-mlo]; + } + DEALLOC(fac); + DEALLOC(facscale); + } + } + +void Ylmgen_destroy (Ylmgen_C *gen) + { + DEALLOC(gen->cf); + if (gen->s==0) + { + DEALLOC(gen->rf); + DEALLOC(gen->mfac); + DEALLOC(gen->root); + DEALLOC(gen->iroot); + } + else + { + DEALLOC(gen->fx); + DEALLOC(gen->prefac); + DEALLOC(gen->fscale); + DEALLOC(gen->flm1); + DEALLOC(gen->flm2); + DEALLOC(gen->inv); + } + } + +void Ylmgen_prepare (Ylmgen_C *gen, int m) + { + if (m==gen->m) return; + UTIL_ASSERT(m>=0,"incorrect m"); + gen->m = m; + + if (gen->s==0) + { + gen->rf[m].f[0] = gen->root[2*m+3]; + gen->rf[m].f[1] = 0.; + for (int l=m+1; l<=gen->lmax; ++l) + { + double tmp=gen->root[2*l+3]*gen->iroot[l+1+m]*gen->iroot[l+1-m]; + gen->rf[l].f[0] = tmp*gen->root[2*l+1]; + gen->rf[l].f[1] = tmp*gen->root[l+m]*gen->root[l-m]*gen->iroot[2*l-1]; + } + } + else + { + int mlo_=m, mhi_=gen->s; + if (mhi_mhi==mhi_) && (gen->mlo==mlo_)); + + gen->mlo = mlo_; gen->mhi = mhi_; + + if 
(!ms_similar) + { + for (int l=gen->mhi; llmax; ++l) + { + double t = gen->flm1[l+gen->m]*gen->flm1[l-gen->m] + *gen->flm1[l+gen->s]*gen->flm1[l-gen->s]; + double lt = 2*l+1; + double l1 = l+1; + gen->fx[l+1].f[0]=l1*lt*t; + gen->fx[l+1].f[1]=gen->m*gen->s*gen->inv[l]*gen->inv[l+1]; + t = gen->flm2[l+gen->m]*gen->flm2[l-gen->m] + *gen->flm2[l+gen->s]*gen->flm2[l-gen->s]; + gen->fx[l+1].f[2]=t*l1*gen->inv[l]; + } + } + + gen->preMinus_p = gen->preMinus_m = 0; + if (gen->mhi==gen->m) + { + gen->cosPow = gen->mhi+gen->s; gen->sinPow = gen->mhi-gen->s; + gen->preMinus_p = gen->preMinus_m = ((gen->mhi-gen->s)&1); + } + else + { + gen->cosPow = gen->mhi+gen->m; gen->sinPow = gen->mhi-gen->m; + gen->preMinus_m = ((gen->mhi+gen->m)&1); + } + } + } + +double *Ylmgen_get_norm (int lmax, int spin) + { + const double pi = 3.141592653589793238462643383279502884197; + double *res=RALLOC(double,lmax+1); + /* sign convention for H=1 (LensPix paper) */ +#if 1 + double spinsign = (spin>0) ? -1.0 : 1.0; +#else + double spinsign = 1.0; +#endif + + if (spin==0) + { + for (int l=0; l<=lmax; ++l) + res[l]=1.; + return res; + } + + spinsign = (spin&1) ? -spinsign : spinsign; + for (int l=0; l<=lmax; ++l) + res[l] = (l