forked from n1gp/rtl_hpsdr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdownsample.c
211 lines (173 loc) · 5.55 KB
/
downsample.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
/* This file downsample.c is part of rtl_hpsdr.
*
* rtl_hpsdr - an RTL to HPSDR software translation server
* Copyright (C) 2014 Richard Koch
*
* rtl_hpsdr is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* rtl_hpsdr is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with rtl_hpsdr. If not, see <http://www.gnu.org/licenses/>.
*/
#include "rtl_hpsdr.h"
#ifdef INCLUDE_NEON
#define vector_type float32x4_t
#define vector_zero vmovq_n_f32(0)
#define vector_mac(x,y,z) vmlaq_f32((x), vld1q_f32((y)), (z))
#define vector_store(x,y,z) vst1q_f32((x), vaddq_f32((y), vcombine_f32(vget_low_f32((z)), vget_high_f32((y)))))
#elif defined INCLUDE_SSE2
#define vector_type __m128
#define vector_zero _mm_setzero_ps()
#define vector_mac(x,y,z) _mm_add_ps((x), _mm_mul_ps(_mm_load_ps((y)), (z)))
#define vector_store(x,y,z) _mm_store_ps((x), _mm_add_ps((y), _mm_shuffle_ps((y), (y), _MM_SHUFFLE(1,0,3,2))))
#else
#define vector_type float
#endif
void
down_loop(struct rcvr_cb* rcb, int pass) {
int i, j, k = 0;
int dsample, count, coeff_len;
bool IQ_swap;
float* buf, *out;
float* pSrc, *orig_buf;
const vector_type* pFil;
vector_type sum1;//, sum2;
struct main_cb* mcb = rcb->mcb;
if(1 == pass) {
// 1st pass filter is hb for 768000
IQ_swap = true;
dsample = 2;
count = RTL_READ_COUNT;
coeff_len = COEFF3072_H_16_LENGTH;
buf = &(rcb->iq_buf[0]);
out = &(rcb->iq_buf_final[COEFF1536_H_32_LENGTH * 2]);
} else {
IQ_swap = false;
count = RTL_READ_COUNT / 2;
coeff_len = mcb->length_fir;
buf = &(rcb->iq_buf_final[0]);
out = &(rcb->iqSamples[rcb->iqSamples_remaining * 2]);
switch(mcb->output_rate) {
case 48000:
dsample = DOWNSAMPLE_192 * 2;
break;
case 96000:
dsample = DOWNSAMPLE_192;
break;
case 192000:
dsample = DOWNSAMPLE_192 / 2;
break;
case 384000:
dsample = DOWNSAMPLE_192 / 4;
break;
}
}
orig_buf = buf;
#if defined(INCLUDE_NEON) || defined(INCLUDE_SSE2)
// filter is evaluated for two I/Q samples with each iteration, thus use of 'j += 2'
for(j = 0; j < count / 2; j += 2) {
pSrc = buf;
// filter coefficients. NOTE: Assumes coefficients are aligned to 16-byte boundary
if(1 == pass)
pFil = (const vector_type*) mcb->align3072_768_H;
else {
switch(mcb->output_rate) {
case 48000:
pFil = (const vector_type*) mcb->align1536_48_H;
break;
case 96000:
pFil = (const vector_type*) mcb->align1536_96_H;
break;
case 192000:
pFil = (const vector_type*) mcb->align1536_192_H;
break;
case 384000:
pFil = (const vector_type*) mcb->align1536_384_H;
break;
}
}
sum1 = vector_zero;
// sum1 = sum2 = vector_zero;
for(i = 0; i < coeff_len / 8; i++) {
// Unroll loop for efficiency & calculate filter for 2*2 I/Q samples
// at each pass
// sum1 is accu for 2*2 filtered I/Q data at the primary data offset
// sum2 is accu for 2*2 filtered I/Q data for the next sample offset.
sum1 = vector_mac(sum1, pSrc, pFil[0]);
// sum2 = vector_mac(sum2, pSrc+2, pFil[0]);
sum1 = vector_mac(sum1, pSrc + 4, pFil[1]);
// sum2 = vector_mac(sum2, pSrc+6, pFil[1]);
sum1 = vector_mac(sum1, pSrc + 8, pFil[2]);
// sum2 = vector_mac(sum2, pSrc+10, pFil[2]);
sum1 = vector_mac(sum1, pSrc + 12, pFil[3]);
// sum2 = vector_mac(sum2, pSrc+14, pFil[3]);
pSrc += 16;
pFil += 4;
}
buf += 4;
// Now sum1 and sum2 both have a filtered 2-channel sample each, but we still need
// to sum the two hi- and lo-floats of these registers together. Only perform this
// if we actually need it for the downsampled output data.
// UPDATE: since we're throwing away sum2, don't bother calculating it.
if(0 == ((j + 2) % dsample)) {
// post-shuffle & add the filtered values and store to dest.
vector_store(rcb->dest, sum1, sum1);
#if 0
_mm_store_ps(rcb->dest, _mm_add_ps(_mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(1, 0, 3, 2)), // s2_1 s2_0 s1_3 s1_2
_mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(3, 2, 1, 0)) // s2_3 s2_2 s1_1 s1_0
));
#endif
// Since we're decimating we only need one of the sums
out[k++] = (IQ_swap) ? rcb->dest[0] : rcb->dest[1];
out[k++] = (IQ_swap) ? rcb->dest[1] : rcb->dest[0];
#else // non SIMD
for(j = 0; j < count; j += 2) {
pSrc = buf;
if(1 == pass)
pFil = mcb->coeff_768;
else {
switch(mcb->output_rate) {
case 48000:
pFil = mcb->coeff_48;
break;
case 96000:
pFil = mcb->coeff_96;
break;
case 192000:
pFil = mcb->coeff_192;
break;
case 384000:
pFil = mcb->coeff_384;
break;
}
}
sum1 = sum2 = 0.0f;
for(i = 0; i < coeff_len; i++) {
sum1 += pSrc[0] * pFil[0];
sum2 += pSrc[1] * pFil[0];
pSrc += 2;
pFil++;
}
buf += 2;
if(0 == (((j + 2) / 2) % dsample)) {
out[k++] = (IQ_swap) ? sum2 : sum1;
out[k++] = (IQ_swap) ? sum1 : sum2;
#endif
}
}
// Move the last coeff_len*2 length of buffer to the front for the next call
memmove(orig_buf, &orig_buf[count - (coeff_len * 2)],
coeff_len * 2 * sizeof(float));
}
void
downsample(struct rcvr_cb * rcb) {
down_loop(rcb, 1);
down_loop(rcb, 2);
}