CMSIS-DSP: Added f16 matrix functions
parent
7d79ffa51f
commit
d2d691cc23
@ -0,0 +1,40 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
* Project: CMSIS DSP Library
|
||||
* Title: MatrixFunctions.c
|
||||
* Description: Combination of all matrix function f16 source files.
|
||||
*
|
||||
* $Date: 18. March 2020
|
||||
* $Revision: V1.0.0
|
||||
*
|
||||
* Target Processor: Cortex-M cores
|
||||
* -------------------------------------------------------------------- */
|
||||
/*
|
||||
* Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the License); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "arm_mat_add_f16.c"
|
||||
#include "arm_mat_sub_f16.c"
|
||||
#include "arm_mat_trans_f16.c"
|
||||
#include "arm_mat_scale_f16.c"
|
||||
#include "arm_mat_mult_f16.c"
|
||||
#include "arm_mat_vec_mult_f16.c"
|
||||
#include "arm_mat_cmplx_trans_f16.c"
|
||||
#include "arm_mat_cmplx_mult_f16.c"
|
||||
#include "arm_mat_inverse_f16.c"
|
||||
#include "arm_mat_init_f16.c"
|
||||
|
||||
|
||||
@ -0,0 +1,217 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
* Project: CMSIS DSP Library
|
||||
* Title: arm_mat_add_f16.c
|
||||
* Description: Floating-point matrix addition
|
||||
*
|
||||
* $Date: 18. March 2020
|
||||
* $Revision: V1.6.0
|
||||
*
|
||||
* Target Processor: Cortex-M cores
|
||||
* -------------------------------------------------------------------- */
|
||||
/*
|
||||
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the License); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "dsp/matrix_functions_f16.h"
|
||||
|
||||
#if defined(ARM_FLOAT16_SUPPORTED)
|
||||
|
||||
|
||||
/**
|
||||
@ingroup groupMatrix
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
@addtogroup MatrixAdd
|
||||
@{
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
@brief Floating-point matrix addition.
|
||||
@param[in] pSrcA points to first input matrix structure
|
||||
@param[in] pSrcB points to second input matrix structure
|
||||
@param[out] pDst points to output matrix structure
|
||||
@return execution status
|
||||
- \ref ARM_MATH_SUCCESS : Operation successful
|
||||
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
|
||||
*/
|
||||
|
||||
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
|
||||
|
||||
arm_status arm_mat_add_f16(
|
||||
const arm_matrix_instance_f16 * pSrcA,
|
||||
const arm_matrix_instance_f16 * pSrcB,
|
||||
arm_matrix_instance_f16 * pDst)
|
||||
{
|
||||
arm_status status;
|
||||
uint32_t numSamples; /* total number of elements in the matrix */
|
||||
float16_t *pDataA, *pDataB, *pDataDst;
|
||||
f16x8_t vecA, vecB, vecDst;
|
||||
float16_t const *pSrcAVec;
|
||||
float16_t const *pSrcBVec;
|
||||
uint32_t blkCnt; /* loop counters */
|
||||
|
||||
pDataA = pSrcA->pData;
|
||||
pDataB = pSrcB->pData;
|
||||
pDataDst = pDst->pData;
|
||||
pSrcAVec = (float16_t const *) pDataA;
|
||||
pSrcBVec = (float16_t const *) pDataB;
|
||||
|
||||
#ifdef ARM_MATH_MATRIX_CHECK
|
||||
/* Check for matrix mismatch condition */
|
||||
if ((pSrcA->numRows != pSrcB->numRows) ||
|
||||
(pSrcA->numCols != pSrcB->numCols) ||
|
||||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
|
||||
{
|
||||
/* Set status as ARM_MATH_SIZE_MISMATCH */
|
||||
status = ARM_MATH_SIZE_MISMATCH;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
/*
|
||||
* Total number of samples in the input matrix
|
||||
*/
|
||||
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
|
||||
blkCnt = numSamples >> 3;
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/* C(m,n) = A(m,n) + B(m,n) */
|
||||
/* Add and then store the results in the destination buffer. */
|
||||
vecA = vld1q(pSrcAVec);
|
||||
pSrcAVec += 8;
|
||||
vecB = vld1q(pSrcBVec);
|
||||
pSrcBVec += 8;
|
||||
vecDst = vaddq(vecA, vecB);
|
||||
vst1q(pDataDst, vecDst);
|
||||
pDataDst += 8;
|
||||
/*
|
||||
* Decrement the blockSize loop counter
|
||||
*/
|
||||
blkCnt--;
|
||||
}
|
||||
/*
|
||||
* tail
|
||||
*/
|
||||
blkCnt = numSamples & 7;
|
||||
if (blkCnt > 0U)
|
||||
{
|
||||
mve_pred16_t p0 = vctp16q(blkCnt);
|
||||
vecA = vld1q(pSrcAVec);
|
||||
vecB = vld1q(pSrcBVec);
|
||||
vecDst = vaddq_m(vecDst, vecA, vecB, p0);
|
||||
vstrhq_p(pDataDst, vecDst, p0);
|
||||
}
|
||||
/* set status as ARM_MATH_SUCCESS */
|
||||
status = ARM_MATH_SUCCESS;
|
||||
}
|
||||
return (status);
|
||||
}
|
||||
#else
|
||||
|
||||
arm_status arm_mat_add_f16(
|
||||
const arm_matrix_instance_f16 * pSrcA,
|
||||
const arm_matrix_instance_f16 * pSrcB,
|
||||
arm_matrix_instance_f16 * pDst)
|
||||
{
|
||||
float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
|
||||
float16_t *pInB = pSrcB->pData; /* input data matrix pointer B */
|
||||
float16_t *pOut = pDst->pData; /* output data matrix pointer */
|
||||
|
||||
uint32_t numSamples; /* total number of elements in the matrix */
|
||||
uint32_t blkCnt; /* loop counters */
|
||||
arm_status status; /* status of matrix addition */
|
||||
|
||||
#ifdef ARM_MATH_MATRIX_CHECK
|
||||
|
||||
/* Check for matrix mismatch condition */
|
||||
if ((pSrcA->numRows != pSrcB->numRows) ||
|
||||
(pSrcA->numCols != pSrcB->numCols) ||
|
||||
(pSrcA->numRows != pDst->numRows) ||
|
||||
(pSrcA->numCols != pDst->numCols) )
|
||||
{
|
||||
/* Set status as ARM_MATH_SIZE_MISMATCH */
|
||||
status = ARM_MATH_SIZE_MISMATCH;
|
||||
}
|
||||
else
|
||||
|
||||
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
|
||||
|
||||
{
|
||||
/* Total number of samples in input matrix */
|
||||
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
|
||||
|
||||
#if defined (ARM_MATH_LOOPUNROLL)
|
||||
|
||||
/* Loop unrolling: Compute 4 outputs at a time */
|
||||
blkCnt = numSamples >> 2U;
|
||||
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/* C(m,n) = A(m,n) + B(m,n) */
|
||||
|
||||
/* Add and store result in destination buffer. */
|
||||
*pOut++ = *pInA++ + *pInB++;
|
||||
|
||||
*pOut++ = *pInA++ + *pInB++;
|
||||
|
||||
*pOut++ = *pInA++ + *pInB++;
|
||||
|
||||
*pOut++ = *pInA++ + *pInB++;
|
||||
|
||||
/* Decrement loop counter */
|
||||
blkCnt--;
|
||||
}
|
||||
|
||||
/* Loop unrolling: Compute remaining outputs */
|
||||
blkCnt = numSamples % 0x4U;
|
||||
|
||||
#else
|
||||
|
||||
/* Initialize blkCnt with number of samples */
|
||||
blkCnt = numSamples;
|
||||
|
||||
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
|
||||
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/* C(m,n) = A(m,n) + B(m,n) */
|
||||
|
||||
/* Add and store result in destination buffer. */
|
||||
*pOut++ = *pInA++ + *pInB++;
|
||||
|
||||
/* Decrement loop counter */
|
||||
blkCnt--;
|
||||
}
|
||||
|
||||
/* Set status as ARM_MATH_SUCCESS */
|
||||
status = ARM_MATH_SUCCESS;
|
||||
}
|
||||
|
||||
/* Return to application */
|
||||
return (status);
|
||||
}
|
||||
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
|
||||
|
||||
/**
|
||||
@} end of MatrixAdd group
|
||||
*/
|
||||
|
||||
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
|
||||
|
||||
@ -0,0 +1,930 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
* Project: CMSIS DSP Library
|
||||
* Title: arm_mat_cmplx_mult_f16.c
|
||||
* Description: Floating-point matrix multiplication
|
||||
*
|
||||
* $Date: 18. March 2020
|
||||
* $Revision: V1.6.0
|
||||
*
|
||||
* Target Processor: Cortex-M cores
|
||||
* -------------------------------------------------------------------- */
|
||||
/*
|
||||
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the License); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "dsp/matrix_functions_f16.h"
|
||||
|
||||
#if defined(ARM_FLOAT16_SUPPORTED)
|
||||
|
||||
|
||||
/**
|
||||
@ingroup groupMatrix
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
@addtogroup CmplxMatrixMult
|
||||
@{
|
||||
*/
|
||||
|
||||
/**
|
||||
@brief Floating-point Complex matrix multiplication.
|
||||
@param[in] pSrcA points to first input complex matrix structure
|
||||
@param[in] pSrcB points to second input complex matrix structure
|
||||
@param[out] pDst points to output complex matrix structure
|
||||
@return execution status
|
||||
- \ref ARM_MATH_SUCCESS : Operation successful
|
||||
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
|
||||
*/
|
||||
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
|
||||
|
||||
#include <stdio.h>
|
||||
#include "arm_helium_utils.h"
|
||||
|
||||
#define DONTCARE 0 /* inactive lane content */
|
||||
|
||||
|
||||
__STATIC_FORCEINLINE arm_status arm_mat_cmplx_mult_f16_2x2_mve(
|
||||
const arm_matrix_instance_f16 * pSrcA,
|
||||
const arm_matrix_instance_f16 * pSrcB,
|
||||
arm_matrix_instance_f16 * pDst)
|
||||
{
|
||||
const uint16_t MATRIX_DIM = 2;
|
||||
float16_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
|
||||
float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
|
||||
float16_t *pOut = pDst->pData; /* output data matrix pointer */
|
||||
uint16x8_t vecColBOffs0,vecColAOffs0,vecColAOffs1;
|
||||
float16_t *pInA0 = pInA;
|
||||
f16x8_t acc0, acc1;
|
||||
f16x8_t vecB, vecA0, vecA1;
|
||||
f16x8_t vecTmp;
|
||||
uint16_t tmp;
|
||||
static const uint16_t offsetB0[8] = { 0, 1,
|
||||
MATRIX_DIM * CMPLX_DIM, MATRIX_DIM * CMPLX_DIM + 1,
|
||||
2, 3,
|
||||
MATRIX_DIM * CMPLX_DIM + 2 , MATRIX_DIM * CMPLX_DIM + 3,
|
||||
};
|
||||
|
||||
|
||||
vecColBOffs0 = vldrhq_u16((uint16_t const *) offsetB0);
|
||||
|
||||
tmp = 0;
|
||||
vecColAOffs0 = viwdupq_u16(tmp, 4, 1);
|
||||
|
||||
tmp = (CMPLX_DIM * MATRIX_DIM);
|
||||
vecColAOffs1 = vecColAOffs0 + (uint16_t)(CMPLX_DIM * MATRIX_DIM);
|
||||
|
||||
|
||||
pInB = (float16_t const *)pSrcB->pData;
|
||||
|
||||
vecA0 = vldrhq_gather_shifted_offset_f16(pInA0, vecColAOffs0);
|
||||
vecA1 = vldrhq_gather_shifted_offset_f16(pInA0, vecColAOffs1);
|
||||
|
||||
|
||||
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
|
||||
|
||||
acc0 = vcmulq(vecA0, vecB);
|
||||
acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
|
||||
|
||||
acc1 = vcmulq(vecA1, vecB);
|
||||
acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
|
||||
|
||||
|
||||
/*
|
||||
* Compute
|
||||
* re0+re1 | im0+im1 | re0+re1 | im0+im1
|
||||
* re2+re3 | im2+im3 | re2+re3 | im2+im3
|
||||
*/
|
||||
|
||||
vecTmp = (f16x8_t) vrev64q_s32((int32x4_t) acc0);
|
||||
vecTmp = vaddq(vecTmp, acc0);
|
||||
|
||||
|
||||
*(float32_t *)(&pOut[0 * CMPLX_DIM * MATRIX_DIM]) = ((f32x4_t)vecTmp)[0];
|
||||
*(float32_t *)(&pOut[0 * CMPLX_DIM * MATRIX_DIM + CMPLX_DIM]) = ((f32x4_t)vecTmp)[2];
|
||||
|
||||
vecTmp = (f16x8_t) vrev64q_s32((int32x4_t) acc1);
|
||||
vecTmp = vaddq(vecTmp, acc1);
|
||||
|
||||
*(float32_t *)(&pOut[1 * CMPLX_DIM * MATRIX_DIM]) = ((f32x4_t)vecTmp)[0];
|
||||
*(float32_t *)(&pOut[1 * CMPLX_DIM * MATRIX_DIM + CMPLX_DIM]) = ((f32x4_t)vecTmp)[2];
|
||||
|
||||
/*
|
||||
* Return to application
|
||||
*/
|
||||
return (ARM_MATH_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
|
||||
__STATIC_FORCEINLINE arm_status arm_mat_cmplx_mult_f16_3x3_mve(
|
||||
const arm_matrix_instance_f16 * pSrcA,
|
||||
const arm_matrix_instance_f16 * pSrcB,
|
||||
arm_matrix_instance_f16 * pDst)
|
||||
{
|
||||
const uint16_t MATRIX_DIM = 3;
|
||||
float16_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
|
||||
float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
|
||||
float16_t *pOut = pDst->pData; /* output data matrix pointer */
|
||||
uint16x8_t vecColBOffs0;
|
||||
float16_t *pInA0 = pInA;
|
||||
float16_t *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM;
|
||||
float16_t *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM;
|
||||
f16x8_t acc0, acc1, acc2;
|
||||
f16x8_t vecB, vecA0, vecA1, vecA2;
|
||||
static const uint16_t offsetB0[8] = { 0, 1,
|
||||
MATRIX_DIM * CMPLX_DIM, MATRIX_DIM * CMPLX_DIM + 1,
|
||||
2 * MATRIX_DIM * CMPLX_DIM, 2 * MATRIX_DIM * CMPLX_DIM + 1,
|
||||
DONTCARE, DONTCARE
|
||||
};
|
||||
|
||||
|
||||
/* enable predication to disable upper half complex vector element */
|
||||
mve_pred16_t p0 = vctp16q(MATRIX_DIM * CMPLX_DIM);
|
||||
|
||||
vecColBOffs0 = vldrhq_u16((uint16_t const *) offsetB0);
|
||||
|
||||
pInB = (float16_t const *)pSrcB->pData;
|
||||
|
||||
vecA0 = vldrhq_f16(pInA0);
|
||||
vecA1 = vldrhq_f16(pInA1);
|
||||
vecA2 = vldrhq_f16(pInA2);
|
||||
|
||||
vecB = vldrhq_gather_shifted_offset_z(pInB, vecColBOffs0, p0);
|
||||
|
||||
acc0 = vcmulq(vecA0, vecB);
|
||||
acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
|
||||
|
||||
acc1 = vcmulq(vecA1, vecB);
|
||||
acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
|
||||
|
||||
acc2 = vcmulq(vecA2, vecB);
|
||||
acc2 = vcmlaq_rot90(acc2, vecA2, vecB);
|
||||
|
||||
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
|
||||
pOut += CMPLX_DIM;
|
||||
/*
|
||||
* move to next B column
|
||||
*/
|
||||
pInB = pInB + CMPLX_DIM;
|
||||
|
||||
vecB = vldrhq_gather_shifted_offset_z(pInB, vecColBOffs0, p0);
|
||||
|
||||
acc0 = vcmulq(vecA0, vecB);
|
||||
acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
|
||||
|
||||
acc1 = vcmulq(vecA1, vecB);
|
||||
acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
|
||||
|
||||
acc2 = vcmulq(vecA2, vecB);
|
||||
acc2 = vcmlaq_rot90(acc2, vecA2, vecB);
|
||||
|
||||
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
|
||||
pOut += CMPLX_DIM;
|
||||
/*
|
||||
* move to next B column
|
||||
*/
|
||||
pInB = pInB + CMPLX_DIM;
|
||||
|
||||
vecB = vldrhq_gather_shifted_offset_z(pInB, vecColBOffs0, p0);
|
||||
|
||||
acc0 = vcmulq(vecA0, vecB);
|
||||
acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
|
||||
|
||||
acc1 = vcmulq(vecA1, vecB);
|
||||
acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
|
||||
|
||||
acc2 = vcmulq(vecA2, vecB);
|
||||
acc2 = vcmlaq_rot90(acc2, vecA2, vecB);
|
||||
|
||||
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
|
||||
/*
|
||||
* Return to application
|
||||
*/
|
||||
return (ARM_MATH_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
__STATIC_FORCEINLINE arm_status arm_mat_cmplx_mult_f16_4x4_mve(
|
||||
const arm_matrix_instance_f16 * pSrcA,
|
||||
const arm_matrix_instance_f16 * pSrcB,
|
||||
arm_matrix_instance_f16 * pDst)
|
||||
{
|
||||
const uint16_t MATRIX_DIM = 4;
|
||||
float16_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
|
||||
float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
|
||||
float16_t *pOut = pDst->pData; /* output data matrix pointer */
|
||||
uint16x8_t vecColBOffs0;
|
||||
float16_t *pInA0 = pInA;
|
||||
float16_t *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM;
|
||||
float16_t *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM;
|
||||
float16_t *pInA3 = pInA2 + CMPLX_DIM * MATRIX_DIM;
|
||||
f16x8_t acc0, acc1, acc2, acc3;
|
||||
f16x8_t vecB, vecA;
|
||||
static const uint16_t offsetB0[8] = { 0, 1,
|
||||
MATRIX_DIM * CMPLX_DIM, MATRIX_DIM * CMPLX_DIM + 1,
|
||||
2 * MATRIX_DIM * CMPLX_DIM, 2 * MATRIX_DIM * CMPLX_DIM + 1,
|
||||
3 * MATRIX_DIM * CMPLX_DIM, 3 * MATRIX_DIM * CMPLX_DIM + 1
|
||||
};
|
||||
|
||||
vecColBOffs0 = vldrhq_u16((uint16_t const *) offsetB0);
|
||||
|
||||
pInB = (float16_t const *)pSrcB->pData;
|
||||
|
||||
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
|
||||
|
||||
vecA = vldrhq_f16(pInA0);
|
||||
acc0 = vcmulq(vecA, vecB);
|
||||
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
|
||||
|
||||
vecA = vldrhq_f16(pInA1);
|
||||
acc1 = vcmulq(vecA, vecB);
|
||||
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
|
||||
|
||||
vecA = vldrhq_f16(pInA2);
|
||||
acc2 = vcmulq(vecA, vecB);
|
||||
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
|
||||
|
||||
vecA = vldrhq_f16(pInA3);
|
||||
acc3 = vcmulq(vecA, vecB);
|
||||
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
|
||||
|
||||
|
||||
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
|
||||
pOut += CMPLX_DIM;
|
||||
/*
|
||||
* move to next B column
|
||||
*/
|
||||
pInB = pInB + CMPLX_DIM;
|
||||
|
||||
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
|
||||
|
||||
vecA = vldrhq_f16(pInA0);
|
||||
acc0 = vcmulq(vecA, vecB);
|
||||
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
|
||||
|
||||
vecA = vldrhq_f16(pInA1);
|
||||
acc1 = vcmulq(vecA, vecB);
|
||||
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
|
||||
|
||||
vecA = vldrhq_f16(pInA2);
|
||||
acc2 = vcmulq(vecA, vecB);
|
||||
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
|
||||
|
||||
vecA = vldrhq_f16(pInA3);
|
||||
acc3 = vcmulq(vecA, vecB);
|
||||
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
|
||||
|
||||
|
||||
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
|
||||
pOut += CMPLX_DIM;
|
||||
/*
|
||||
* move to next B column
|
||||
*/
|
||||
pInB = pInB + CMPLX_DIM;
|
||||
|
||||
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
|
||||
|
||||
vecA = vldrhq_f16(pInA0);
|
||||
acc0 = vcmulq(vecA, vecB);
|
||||
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
|
||||
|
||||
vecA = vldrhq_f16(pInA1);
|
||||
acc1 = vcmulq(vecA, vecB);
|
||||
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
|
||||
|
||||
vecA = vldrhq_f16(pInA2);
|
||||
acc2 = vcmulq(vecA, vecB);
|
||||
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
|
||||
|
||||
vecA = vldrhq_f16(pInA3);
|
||||
acc3 = vcmulq(vecA, vecB);
|
||||
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
|
||||
|
||||
|
||||
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
|
||||
pOut += CMPLX_DIM;
|
||||
/*
|
||||
* move to next B column
|
||||
*/
|
||||
pInB = pInB + CMPLX_DIM;
|
||||
|
||||
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
|
||||
|
||||
vecA = vldrhq_f16(pInA0);
|
||||
acc0 = vcmulq(vecA, vecB);
|
||||
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
|
||||
|
||||
vecA = vldrhq_f16(pInA1);
|
||||
acc1 = vcmulq(vecA, vecB);
|
||||
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
|
||||
|
||||
vecA = vldrhq_f16(pInA2);
|
||||
acc2 = vcmulq(vecA, vecB);
|
||||
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
|
||||
|
||||
vecA = vldrhq_f16(pInA3);
|
||||
acc3 = vcmulq(vecA, vecB);
|
||||
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
|
||||
|
||||
|
||||
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
|
||||
/*
|
||||
* Return to application
|
||||
*/
|
||||
return (ARM_MATH_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
|
||||
arm_status arm_mat_cmplx_mult_f16(
|
||||
const arm_matrix_instance_f16 * pSrcA,
|
||||
const arm_matrix_instance_f16 * pSrcB,
|
||||
arm_matrix_instance_f16 * pDst)
|
||||
{
|
||||
float16_t const *pInB = (float16_t const *) pSrcB->pData; /* input data matrix pointer B */
|
||||
float16_t const *pInA = (float16_t const *) pSrcA->pData; /* input data matrix pointer A */
|
||||
float16_t *pOut = pDst->pData; /* output data matrix pointer */
|
||||
float16_t *px; /* Temporary output data matrix pointer */
|
||||
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
|
||||
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
|
||||
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
|
||||
uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
|
||||
arm_status status; /* status of matrix multiplication */
|
||||
uint16x8_t vecOffs, vecColBOffs;
|
||||
uint32_t blkCnt,rowCnt; /* loop counters */
|
||||
|
||||
#ifdef ARM_MATH_MATRIX_CHECK
|
||||
|
||||
/* Check for matrix mismatch condition */
|
||||
if ((pSrcA->numCols != pSrcB->numRows) ||
|
||||
(pSrcA->numRows != pDst->numRows) ||
|
||||
(pSrcB->numCols != pDst->numCols) )
|
||||
{
|
||||
/* Set status as ARM_MATH_SIZE_MISMATCH */
|
||||
status = ARM_MATH_SIZE_MISMATCH;
|
||||
}
|
||||
else
|
||||
|
||||
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
|
||||
|
||||
{
|
||||
|
||||
/*
|
||||
* small squared matrix specialized routines
|
||||
*/
|
||||
if (numRowsA == numColsB && numColsB == numColsA)
|
||||
{
|
||||
if (numRowsA == 1)
|
||||
{
|
||||
pOut[0] = pInA[0] * pInB[0] - pInA[1] * pInB[1];
|
||||
pOut[1] = pInA[0] * pInB[1] + pInA[1] * pInB[0];
|
||||
return (ARM_MATH_SUCCESS);
|
||||
}
|
||||
else if (numRowsA == 2)
|
||||
return arm_mat_cmplx_mult_f16_2x2_mve(pSrcA, pSrcB, pDst);
|
||||
else if (numRowsA == 3)
|
||||
return arm_mat_cmplx_mult_f16_3x3_mve(pSrcA, pSrcB, pDst);
|
||||
else if (numRowsA == 4)
|
||||
return arm_mat_cmplx_mult_f16_4x4_mve(pSrcA, pSrcB, pDst);
|
||||
}
|
||||
|
||||
vecColBOffs[0] = 0;
|
||||
vecColBOffs[1] = 1;
|
||||
vecColBOffs[2] = numColsB * CMPLX_DIM;
|
||||
vecColBOffs[3] = (numColsB * CMPLX_DIM) + 1;
|
||||
vecColBOffs[4] = 2*numColsB * CMPLX_DIM;
|
||||
vecColBOffs[5] = 2*(numColsB * CMPLX_DIM) + 1;
|
||||
vecColBOffs[6] = 3*numColsB * CMPLX_DIM;
|
||||
vecColBOffs[7] = 3*(numColsB * CMPLX_DIM) + 1;
|
||||
|
||||
/*
|
||||
* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
|
||||
*/
|
||||
|
||||
/*
|
||||
* row loop
|
||||
*/
|
||||
rowCnt = row >> 2;
|
||||
while (rowCnt > 0u)
|
||||
{
|
||||
/*
|
||||
* Output pointer is set to starting address of the row being processed
|
||||
*/
|
||||
px = pOut + i * CMPLX_DIM;
|
||||
i = i + 4 * numColsB;
|
||||
/*
|
||||
* For every row wise process, the column loop counter is to be initiated
|
||||
*/
|
||||
col = numColsB;
|
||||
/*
|
||||
* For every row wise process, the pInB pointer is set
|
||||
* to the starting address of the pSrcB data
|
||||
*/
|
||||
pInB = (float16_t const *) pSrcB->pData;
|
||||
/*
|
||||
* column loop
|
||||
*/
|
||||
while (col > 0u)
|
||||
{
|
||||
/*
|
||||
* generate 4 columns elements
|
||||
*/
|
||||
/*
|
||||
* Matrix A columns number of MAC operations are to be performed
|
||||
*/
|
||||
colCnt = numColsA;
|
||||
|
||||
float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
|
||||
float16_t const *pInA0 = pInA;
|
||||
float16_t const *pInA1 = pInA0 + numColsA * CMPLX_DIM;
|
||||
float16_t const *pInA2 = pInA1 + numColsA * CMPLX_DIM;
|
||||
float16_t const *pInA3 = pInA2 + numColsA * CMPLX_DIM;
|
||||
f16x8_t acc0, acc1, acc2, acc3;
|
||||
|
||||
acc0 = vdupq_n_f16(0.0f16);
|
||||
acc1 = vdupq_n_f16(0.0f16);
|
||||
acc2 = vdupq_n_f16(0.0f16);
|
||||
acc3 = vdupq_n_f16(0.0f16);
|
||||
|
||||
pSrcA0Vec = (float16_t const *) pInA0;
|
||||
pSrcA1Vec = (float16_t const *) pInA1;
|
||||
pSrcA2Vec = (float16_t const *) pInA2;
|
||||
pSrcA3Vec = (float16_t const *) pInA3;
|
||||
|
||||
vecOffs = vecColBOffs;
|
||||
|
||||
/*
|
||||
* process 1 x 4 block output
|
||||
*/
|
||||
blkCnt = (numColsA * CMPLX_DIM) >> 3;
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
f16x8_t vecB, vecA;
|
||||
|
||||
vecB = vldrhq_gather_shifted_offset_f16(pInB, vecOffs);
|
||||
/*
|
||||
* move Matrix B read offsets, 4 rows down
|
||||
*/
|
||||
vecOffs = vaddq_n_u16(vecOffs , (uint16_t) (numColsB * 4 * CMPLX_DIM));
|
||||
|
||||
vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 8;
|
||||
acc0 = vcmlaq(acc0, vecA, vecB);
|
||||
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
|
||||
|
||||
vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 8;
|
||||
acc1 = vcmlaq(acc1, vecA, vecB);
|
||||
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
|
||||
|
||||
vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 8;
|
||||
acc2 = vcmlaq(acc2, vecA, vecB);
|
||||
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
|
||||
|
||||
vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 8;
|
||||
acc3 = vcmlaq(acc3, vecA, vecB);
|
||||
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
|
||||
|
||||
blkCnt--;
|
||||
}
|
||||
/*
|
||||
* Unsupported addressing mode compiler crash
|
||||
*/
|
||||
/*
|
||||
* tail
|
||||
* (will be merged thru tail predication)
|
||||
*/
|
||||
blkCnt = (numColsA * CMPLX_DIM) & 7;
|
||||
if (blkCnt > 0U)
|
||||
{
|
||||
mve_pred16_t p0 = vctp16q(blkCnt);
|
||||
f16x8_t vecB, vecA;
|
||||
|
||||
vecB = vldrhq_gather_shifted_offset_z_f16(pInB, vecOffs, p0);
|
||||
/*
|
||||
* move Matrix B read offsets, 4 rows down
|
||||
*/
|
||||
vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
|
||||
|
||||
vecA = vld1q(pSrcA0Vec);
|
||||
acc0 = vcmlaq(acc0, vecA, vecB);
|
||||
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
|
||||
|
||||
vecA = vld1q(pSrcA1Vec);
|
||||
acc1 = vcmlaq(acc1, vecA, vecB);
|
||||
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
|
||||
|
||||
vecA = vld1q(pSrcA2Vec);
|
||||
acc2 = vcmlaq(acc2, vecA, vecB);
|
||||
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
|
||||
|
||||
vecA = vld1q(pSrcA3Vec);
|
||||
acc3 = vcmlaq(acc3, vecA, vecB);
|
||||
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
|
||||
|
||||
}
|
||||
|
||||
|
||||
mve_cmplx_sum_intra_vec_f16(acc0, &px[0 * CMPLX_DIM * numColsB + 0]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc1, &px[1 * CMPLX_DIM * numColsB + 0]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc2, &px[2 * CMPLX_DIM * numColsB + 0]);
|
||||
mve_cmplx_sum_intra_vec_f16(acc3, &px[3 * CMPLX_DIM * numColsB + 0]);
|
||||
|
||||
px += CMPLX_DIM;
|
||||
/*
|
||||
* Decrement the column loop counter
|
||||
*/
|
||||
col--;
|
||||
/*
|
||||
* Update the pointer pInB to point to the starting address of the next column
|
||||
*/
|
||||
pInB = (float16_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
|
||||
}
|
||||
|
||||
/*
|
||||
* Update the pointer pInA to point to the starting address of the next row
|
||||
*/
|
||||
pInA += (numColsA * 4) * CMPLX_DIM;
|
||||
/*
|
||||
* Decrement the row loop counter
|
||||
*/
|
||||
rowCnt --;
|
||||
|
||||
}
|
||||
|
||||
rowCnt = row & 3;
|
||||
while (rowCnt > 0u)
|
||||
{
|
||||
/*
|
||||
* Output pointer is set to starting address of the row being processed
|
||||
*/
|
||||
px = pOut + i * CMPLX_DIM;
|
||||
i = i + numColsB;
|
||||
/*
|
||||
* For every row wise process, the column loop counter is to be initiated
|
||||
*/
|
||||
col = numColsB;
|
||||
/*
|
||||
* For every row wise process, the pInB pointer is set
|
||||
* to the starting address of the pSrcB data
|
||||
*/
|
||||
pInB = (float16_t const *) pSrcB->pData;
|
||||
/*
|
||||
* column loop
|
||||
*/
|
||||
while (col > 0u)
|
||||
{
|
||||
/*
|
||||
* generate 4 columns elements
|
||||
*/
|
||||
/*
|
||||
* Matrix A columns number of MAC operations are to be performed
|
||||
*/
|
||||
colCnt = numColsA;
|
||||
|
||||
float16_t const *pSrcA0Vec;
|
||||
float16_t const *pInA0 = pInA;
|
||||
f16x8_t acc0;
|
||||
|
||||
acc0 = vdupq_n_f16(0.0f16);
|
||||
|
||||
pSrcA0Vec = (float16_t const *) pInA0;
|
||||
|
||||
vecOffs = vecColBOffs;
|
||||
|
||||
/*
|
||||
* process 1 x 4 block output
|
||||
*/
|
||||
blkCnt = (numColsA * CMPLX_DIM) >> 3;
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
f16x8_t vecB, vecA;
|
||||
|
||||
vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
|
||||
/*
|
||||
* move Matrix B read offsets, 4 rows down
|
||||
*/
|
||||
vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (4*numColsB * CMPLX_DIM));
|
||||
|
||||
vecA = vld1q(pSrcA0Vec);
|
||||
pSrcA0Vec += 8;
|
||||
acc0 = vcmlaq(acc0, vecA, vecB);
|
||||
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
|
||||
|
||||
|
||||
blkCnt--;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* tail
|
||||
*/
|
||||
blkCnt = (numColsA * CMPLX_DIM) & 7;
|
||||
if (blkCnt > 0U)
|
||||
{
|
||||
mve_pred16_t p0 = vctp16q(blkCnt);
|
||||
f16x8_t vecB, vecA;
|
||||
|
||||
vecB = vldrhq_gather_shifted_offset_z(pInB, vecOffs, p0);
|
||||
|
||||
vecA = vld1q(pSrcA0Vec);
|
||||
acc0 = vcmlaq(acc0, vecA, vecB);
|
||||
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
|
||||
|
||||
}
|
||||
|
||||
mve_cmplx_sum_intra_vec_f16(acc0, &px[0]);
|
||||
|
||||
|
||||
px += CMPLX_DIM;
|
||||
/*
|
||||
* Decrement the column loop counter
|
||||
*/
|
||||
col--;
|
||||
/*
|
||||
* Update the pointer pInB to point to the starting address of the next column
|
||||
*/
|
||||
pInB = (float16_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
|
||||
}
|
||||
|
||||
/*
|
||||
* Update the pointer pInA to point to the starting address of the next row
|
||||
*/
|
||||
pInA += numColsA * CMPLX_DIM;
|
||||
rowCnt--;
|
||||
}
|
||||
|
||||
/*
|
||||
* set status as ARM_MATH_SUCCESS
|
||||
*/
|
||||
status = ARM_MATH_SUCCESS;
|
||||
}
|
||||
/*
|
||||
* Return to application
|
||||
*/
|
||||
return (status);
|
||||
}
|
||||
#else
|
||||
|
||||
arm_status arm_mat_cmplx_mult_f16(
|
||||
const arm_matrix_instance_f16 * pSrcA,
|
||||
const arm_matrix_instance_f16 * pSrcB,
|
||||
arm_matrix_instance_f16 * pDst)
|
||||
{
|
||||
float16_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
|
||||
float16_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
|
||||
float16_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
|
||||
float16_t *pOut = pDst->pData; /* Output data matrix pointer */
|
||||
float16_t *px; /* Temporary output data matrix pointer */
|
||||
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
|
||||
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
|
||||
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
|
||||
float16_t sumReal, sumImag; /* Accumulator */
|
||||
float16_t a1, b1, c1, d1;
|
||||
uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
|
||||
arm_status status; /* status of matrix multiplication */
|
||||
|
||||
#if defined (ARM_MATH_LOOPUNROLL)
|
||||
float16_t a0, b0, c0, d0;
|
||||
#endif
|
||||
|
||||
#ifdef ARM_MATH_MATRIX_CHECK
|
||||
|
||||
/* Check for matrix mismatch condition */
|
||||
if ((pSrcA->numCols != pSrcB->numRows) ||
|
||||
(pSrcA->numRows != pDst->numRows) ||
|
||||
(pSrcB->numCols != pDst->numCols) )
|
||||
{
|
||||
/* Set status as ARM_MATH_SIZE_MISMATCH */
|
||||
status = ARM_MATH_SIZE_MISMATCH;
|
||||
}
|
||||
else
|
||||
|
||||
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
|
||||
|
||||
{
|
||||
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
|
||||
/* row loop */
|
||||
do
|
||||
{
|
||||
/* Output pointer is set to starting address of the row being processed */
|
||||
px = pOut + 2 * i;
|
||||
|
||||
/* For every row wise process, the column loop counter is to be initiated */
|
||||
col = numColsB;
|
||||
|
||||
/* For every row wise process, the pIn2 pointer is set
|
||||
** to the starting address of the pSrcB data */
|
||||
pIn2 = pSrcB->pData;
|
||||
|
||||
j = 0U;
|
||||
|
||||
/* column loop */
|
||||
do
|
||||
{
|
||||
/* Set the variable sum, that acts as accumulator, to zero */
|
||||
sumReal = 0.0f;
|
||||
sumImag = 0.0f;
|
||||
|
||||
/* Initiate pointer pIn1 to point to starting address of column being processed */
|
||||
pIn1 = pInA;
|
||||
|
||||
#if defined (ARM_MATH_LOOPUNROLL)
|
||||
|
||||
/* Apply loop unrolling and compute 4 MACs simultaneously. */
|
||||
colCnt = numColsA >> 2U;
|
||||
|
||||
/* matrix multiplication */
|
||||
while (colCnt > 0U)
|
||||
{
|
||||
|
||||
/* Reading real part of complex matrix A */
|
||||
a0 = *pIn1;
|
||||
|
||||
/* Reading real part of complex matrix B */
|
||||
c0 = *pIn2;
|
||||
|
||||
/* Reading imaginary part of complex matrix A */
|
||||
b0 = *(pIn1 + 1U);
|
||||
|
||||
/* Reading imaginary part of complex matrix B */
|
||||
d0 = *(pIn2 + 1U);
|
||||
|
||||
/* Multiply and Accumlates */
|
||||
sumReal += a0 * c0;
|
||||
sumImag += b0 * c0;
|
||||
|
||||
/* update pointers */
|
||||
pIn1 += 2U;
|
||||
pIn2 += 2 * numColsB;
|
||||
|
||||
/* Multiply and Accumlates */
|
||||
sumReal -= b0 * d0;
|
||||
sumImag += a0 * d0;
|
||||
|
||||
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
|
||||
|
||||
/* read real and imag values from pSrcA and pSrcB buffer */
|
||||
a1 = *(pIn1 );
|
||||
c1 = *(pIn2 );
|
||||
b1 = *(pIn1 + 1U);
|
||||
d1 = *(pIn2 + 1U);
|
||||
|
||||
/* Multiply and Accumlates */
|
||||
sumReal += a1 * c1;
|
||||
sumImag += b1 * c1;
|
||||
|
||||
/* update pointers */
|
||||
pIn1 += 2U;
|
||||
pIn2 += 2 * numColsB;
|
||||
|
||||
/* Multiply and Accumlates */
|
||||
sumReal -= b1 * d1;
|
||||
sumImag += a1 * d1;
|
||||
|
||||
a0 = *(pIn1 );
|
||||
c0 = *(pIn2 );
|
||||
b0 = *(pIn1 + 1U);
|
||||
d0 = *(pIn2 + 1U);
|
||||
|
||||
/* Multiply and Accumlates */
|
||||
sumReal += a0 * c0;
|
||||
sumImag += b0 * c0;
|
||||
|
||||
/* update pointers */
|
||||
pIn1 += 2U;
|
||||
pIn2 += 2 * numColsB;
|
||||
|
||||
/* Multiply and Accumlates */
|
||||
sumReal -= b0 * d0;
|
||||
sumImag += a0 * d0;
|
||||
|
||||
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
|
||||
|
||||
a1 = *(pIn1 );
|
||||
c1 = *(pIn2 );
|
||||
b1 = *(pIn1 + 1U);
|
||||
d1 = *(pIn2 + 1U);
|
||||
|
||||
/* Multiply and Accumlates */
|
||||
sumReal += a1 * c1;
|
||||
sumImag += b1 * c1;
|
||||
|
||||
/* update pointers */
|
||||
pIn1 += 2U;
|
||||
pIn2 += 2 * numColsB;
|
||||
|
||||
/* Multiply and Accumlates */
|
||||
sumReal -= b1 * d1;
|
||||
sumImag += a1 * d1;
|
||||
|
||||
/* Decrement loop count */
|
||||
colCnt--;
|
||||
}
|
||||
|
||||
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
|
||||
** No loop unrolling is used. */
|
||||
colCnt = numColsA % 0x4U;
|
||||
|
||||
#else
|
||||
|
||||
/* Initialize blkCnt with number of samples */
|
||||
colCnt = numColsA;
|
||||
|
||||
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
|
||||
|
||||
while (colCnt > 0U)
|
||||
{
|
||||
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
|
||||
a1 = *(pIn1 );
|
||||
c1 = *(pIn2 );
|
||||
b1 = *(pIn1 + 1U);
|
||||
d1 = *(pIn2 + 1U);
|
||||
|
||||
/* Multiply and Accumlates */
|
||||
sumReal += a1 * c1;
|
||||
sumImag += b1 * c1;
|
||||
|
||||
/* update pointers */
|
||||
pIn1 += 2U;
|
||||
pIn2 += 2 * numColsB;
|
||||
|
||||
/* Multiply and Accumlates */
|
||||
sumReal -= b1 * d1;
|
||||
sumImag += a1 * d1;
|
||||
|
||||
/* Decrement loop counter */
|
||||
colCnt--;
|
||||
}
|
||||
|
||||
/* Store result in destination buffer */
|
||||
*px++ = sumReal;
|
||||
*px++ = sumImag;
|
||||
|
||||
/* Update pointer pIn2 to point to starting address of next column */
|
||||
j++;
|
||||
pIn2 = pSrcB->pData + 2U * j;
|
||||
|
||||
/* Decrement column loop counter */
|
||||
col--;
|
||||
|
||||
} while (col > 0U);
|
||||
|
||||
/* Update pointer pInA to point to starting address of next row */
|
||||
i = i + numColsB;
|
||||
pInA = pInA + 2 * numColsA;
|
||||
|
||||
/* Decrement row loop counter */
|
||||
row--;
|
||||
|
||||
} while (row > 0U);
|
||||
|
||||
/* Set status as ARM_MATH_SUCCESS */
|
||||
status = ARM_MATH_SUCCESS;
|
||||
}
|
||||
|
||||
/* Return to application */
|
||||
return (status);
|
||||
}
|
||||
|
||||
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
|
||||
|
||||
/**
|
||||
@} end of MatrixMult group
|
||||
*/
|
||||
|
||||
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
|
||||
|
||||
@ -0,0 +1,131 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
* Project: CMSIS DSP Library
|
||||
* Title: arm_mat_cmplx_trans_f16.c
|
||||
* Description: Floating-point complex matrix transpose
|
||||
*
|
||||
* $Date: 08. July 2020
|
||||
* $Revision: V1.6.0
|
||||
*
|
||||
* Target Processor: Cortex-M cores
|
||||
* -------------------------------------------------------------------- */
|
||||
/*
|
||||
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the License); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "dsp/matrix_functions_f16.h"
|
||||
|
||||
#if defined(ARM_FLOAT16_SUPPORTED)
|
||||
|
||||
|
||||
/**
|
||||
@ingroup groupMatrix
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
@addtogroup MatrixComplexTrans
|
||||
@{
|
||||
*/
|
||||
|
||||
/**
|
||||
@brief Floating-point matrix transpose.
|
||||
@param[in] pSrc points to input matrix
|
||||
@param[out] pDst points to output matrix
|
||||
@return execution status
|
||||
- \ref ARM_MATH_SUCCESS : Operation successful
|
||||
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
|
||||
*/
|
||||
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
|
||||
|
||||
#include "arm_helium_utils.h"
|
||||
|
||||
arm_status arm_mat_cmplx_trans_f16(const arm_matrix_instance_f16 * pSrc, arm_matrix_instance_f16 * pDst)
|
||||
{
|
||||
return arm_mat_cmplx_trans_16bit(pSrc->numRows, pSrc->numCols, (uint16_t *) pSrc->pData,
|
||||
pDst->numRows, pDst->numCols, (uint16_t *) pDst->pData);
|
||||
}
|
||||
|
||||
#else
|
||||
arm_status arm_mat_cmplx_trans_f16(
|
||||
const arm_matrix_instance_f16 * pSrc,
|
||||
arm_matrix_instance_f16 * pDst)
|
||||
{
|
||||
float16_t *pIn = pSrc->pData; /* input data matrix pointer */
|
||||
float16_t *pOut = pDst->pData; /* output data matrix pointer */
|
||||
float16_t *px; /* Temporary output data matrix pointer */
|
||||
uint16_t nRows = pSrc->numRows; /* number of rows */
|
||||
uint16_t nColumns = pSrc->numCols; /* number of columns */
|
||||
uint16_t col, i = 0U, row = nRows; /* loop counters */
|
||||
arm_status status; /* status of matrix transpose */
|
||||
|
||||
|
||||
#ifdef ARM_MATH_MATRIX_CHECK
|
||||
|
||||
/* Check for matrix mismatch condition */
|
||||
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
|
||||
{
|
||||
/* Set status as ARM_MATH_SIZE_MISMATCH */
|
||||
status = ARM_MATH_SIZE_MISMATCH;
|
||||
}
|
||||
else
|
||||
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
|
||||
|
||||
{
|
||||
/* Matrix transpose by exchanging the rows with columns */
|
||||
/* row loop */
|
||||
do
|
||||
{
|
||||
/* The pointer px is set to starting address of the column being processed */
|
||||
px = pOut + CMPLX_DIM * i;
|
||||
|
||||
/* Initialize column loop counter */
|
||||
col = nColumns;
|
||||
|
||||
while (col > 0U)
|
||||
{
|
||||
/* Read and store the input element in the destination */
|
||||
px[0] = *pIn++; // real
|
||||
px[1] = *pIn++; // imag
|
||||
|
||||
/* Update the pointer px to point to the next row of the transposed matrix */
|
||||
px += CMPLX_DIM * nRows;
|
||||
|
||||
/* Decrement the column loop counter */
|
||||
col--;
|
||||
}
|
||||
i++;
|
||||
|
||||
/* Decrement the row loop counter */
|
||||
row--;
|
||||
|
||||
} while (row > 0U); /* row loop end */
|
||||
|
||||
/* Set status as ARM_MATH_SUCCESS */
|
||||
status = ARM_MATH_SUCCESS;
|
||||
}
|
||||
|
||||
/* Return to application */
|
||||
return (status);
|
||||
}
|
||||
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
|
||||
|
||||
/**
|
||||
* @} end of MatrixTrans group
|
||||
*/
|
||||
|
||||
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
|
||||
|
||||
@ -0,0 +1,74 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
* Project: CMSIS DSP Library
|
||||
* Title: arm_mat_init_f16.c
|
||||
* Description: Floating-point matrix initialization
|
||||
*
|
||||
* $Date: 18. March 2020
|
||||
* $Revision: V1.6.0
|
||||
*
|
||||
* Target Processor: Cortex-M cores
|
||||
* -------------------------------------------------------------------- */
|
||||
/*
|
||||
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the License); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "dsp/matrix_functions_f16.h"
|
||||
|
||||
#if defined(ARM_FLOAT16_SUPPORTED)
|
||||
|
||||
|
||||
/**
|
||||
@ingroup groupMatrix
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
@addtogroup MatrixInit
|
||||
@{
|
||||
*/
|
||||
|
||||
/**
|
||||
@brief Floating-point matrix initialization.
|
||||
@param[in,out] S points to an instance of the floating-point matrix structure
|
||||
@param[in] nRows number of rows in the matrix
|
||||
@param[in] nColumns number of columns in the matrix
|
||||
@param[in] pData points to the matrix data array
|
||||
@return none
|
||||
*/
|
||||
|
||||
void arm_mat_init_f16(
|
||||
arm_matrix_instance_f16 * S,
|
||||
uint16_t nRows,
|
||||
uint16_t nColumns,
|
||||
float16_t * pData)
|
||||
{
|
||||
/* Assign Number of Rows */
|
||||
S->numRows = nRows;
|
||||
|
||||
/* Assign Number of Columns */
|
||||
S->numCols = nColumns;
|
||||
|
||||
/* Assign Data pointer */
|
||||
S->pData = pData;
|
||||
}
|
||||
|
||||
/**
|
||||
@} end of MatrixInit group
|
||||
*/
|
||||
|
||||
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
|
||||
|
||||
@ -0,0 +1,904 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
* Project: CMSIS DSP Library
|
||||
* Title: arm_mat_inverse_f16.c
|
||||
* Description: Floating-point matrix inverse
|
||||
*
|
||||
* $Date: 18. March 2020
|
||||
* $Revision: V1.6.0
|
||||
*
|
||||
* Target Processor: Cortex-M cores
|
||||
* -------------------------------------------------------------------- */
|
||||
/*
|
||||
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the License); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "dsp/matrix_functions_f16.h"
|
||||
|
||||
#if defined(ARM_FLOAT16_SUPPORTED)
|
||||
|
||||
|
||||
/**
|
||||
@ingroup groupMatrix
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
@addtogroup MatrixInv
|
||||
@{
|
||||
*/
|
||||
|
||||
/**
|
||||
@brief Floating-point matrix inverse.
|
||||
@param[in] pSrc points to input matrix structure. The source matrix is modified by the function.
|
||||
@param[out] pDst points to output matrix structure
|
||||
@return execution status
|
||||
- \ref ARM_MATH_SUCCESS : Operation successful
|
||||
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
|
||||
- \ref ARM_MATH_SINGULAR : Input matrix is found to be singular (non-invertible)
|
||||
*/
|
||||
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
|
||||
|
||||
arm_status arm_mat_inverse_f16(
|
||||
const arm_matrix_instance_f16 * pSrc,
|
||||
arm_matrix_instance_f16 * pDst)
|
||||
{
|
||||
float16_t *pIn = pSrc->pData; /* input data matrix pointer */
|
||||
float16_t *pOut = pDst->pData; /* output data matrix pointer */
|
||||
float16_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
|
||||
float16_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
|
||||
float16_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
|
||||
|
||||
uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
|
||||
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
|
||||
float16_t *pTmpA, *pTmpB;
|
||||
|
||||
float16_t in = 0.0f; /* Temporary input values */
|
||||
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
|
||||
arm_status status; /* status of matrix inverse */
|
||||
uint32_t blkCnt;
|
||||
|
||||
#ifdef ARM_MATH_MATRIX_CHECK
|
||||
/* Check for matrix mismatch condition */
|
||||
if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
|
||||
|| (pSrc->numRows != pDst->numRows))
|
||||
{
|
||||
/* Set status as ARM_MATH_SIZE_MISMATCH */
|
||||
status = ARM_MATH_SIZE_MISMATCH;
|
||||
}
|
||||
else
|
||||
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
|
||||
{
|
||||
|
||||
/*--------------------------------------------------------------------------------------------------------------
|
||||
* Matrix Inverse can be solved using elementary row operations.
|
||||
*
|
||||
* Gauss-Jordan Method:
|
||||
*
|
||||
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
|
||||
* augmented matrix as follows:
|
||||
* _ _ _ _ _ _ _ _
|
||||
* | | a11 a12 | | | 1 0 | | | X11 X12 |
|
||||
* | | | | | | | = | |
|
||||
* |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
|
||||
*
|
||||
* 2. In our implementation, pDst Matrix is used as identity matrix.
|
||||
*
|
||||
* 3. Begin with the first row. Let i = 1.
|
||||
*
|
||||
* 4. Check to see if the pivot for row i is zero.
|
||||
* The pivot is the element of the main diagonal that is on the current row.
|
||||
* For instance, if working with row i, then the pivot element is aii.
|
||||
* If the pivot is zero, exchange that row with a row below it that does not
|
||||
* contain a zero in column i. If this is not possible, then an inverse
|
||||
* to that matrix does not exist.
|
||||
*
|
||||
* 5. Divide every element of row i by the pivot.
|
||||
*
|
||||
* 6. For every row below and row i, replace that row with the sum of that row and
|
||||
* a multiple of row i so that each new element in column i below row i is zero.
|
||||
*
|
||||
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
|
||||
* for every element below and above the main diagonal.
|
||||
*
|
||||
* 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
|
||||
* Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
|
||||
*----------------------------------------------------------------------------------------------------------------*/
|
||||
|
||||
/*
|
||||
* Working pointer for destination matrix
|
||||
*/
|
||||
pOutT1 = pOut;
|
||||
/*
|
||||
* Loop over the number of rows
|
||||
*/
|
||||
rowCnt = numRows;
|
||||
/*
|
||||
* Making the destination matrix as identity matrix
|
||||
*/
|
||||
while (rowCnt > 0U)
|
||||
{
|
||||
/*
|
||||
* Writing all zeroes in lower triangle of the destination matrix
|
||||
*/
|
||||
j = numRows - rowCnt;
|
||||
while (j > 0U)
|
||||
{
|
||||
*pOutT1++ = 0.0f;
|
||||
j--;
|
||||
}
|
||||
/*
|
||||
* Writing all ones in the diagonal of the destination matrix
|
||||
*/
|
||||
*pOutT1++ = 1.0f;
|
||||
/*
|
||||
* Writing all zeroes in upper triangle of the destination matrix
|
||||
*/
|
||||
j = rowCnt - 1U;
|
||||
while (j > 0U)
|
||||
{
|
||||
*pOutT1++ = 0.0f;
|
||||
j--;
|
||||
}
|
||||
/*
|
||||
* Decrement the loop counter
|
||||
*/
|
||||
rowCnt--;
|
||||
}
|
||||
|
||||
/*
|
||||
* Loop over the number of columns of the input matrix.
|
||||
* All the elements in each column are processed by the row operations
|
||||
*/
|
||||
loopCnt = numCols;
|
||||
/*
|
||||
* Index modifier to navigate through the columns
|
||||
*/
|
||||
l = 0U;
|
||||
while (loopCnt > 0U)
|
||||
{
|
||||
/*
|
||||
* Check if the pivot element is zero..
|
||||
* If it is zero then interchange the row with non zero row below.
|
||||
* If there is no non zero element to replace in the rows below,
|
||||
* then the matrix is Singular.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Working pointer for the input matrix that points
|
||||
* * to the pivot element of the particular row
|
||||
*/
|
||||
pInT1 = pIn + (l * numCols);
|
||||
/*
|
||||
* Working pointer for the destination matrix that points
|
||||
* * to the pivot element of the particular row
|
||||
*/
|
||||
pOutT1 = pOut + (l * numCols);
|
||||
/*
|
||||
* Temporary variable to hold the pivot value
|
||||
*/
|
||||
in = *pInT1;
|
||||
/*
|
||||
* Destination pointer modifier
|
||||
*/
|
||||
k = 1U;
|
||||
|
||||
/*
|
||||
* Check if the pivot element is zero
|
||||
*/
|
||||
if (*pInT1 == 0.0f)
|
||||
{
|
||||
/*
|
||||
* Loop over the number rows present below
|
||||
*/
|
||||
for (i = (l + 1U); i < numRows; i++)
|
||||
{
|
||||
/*
|
||||
* Update the input and destination pointers
|
||||
*/
|
||||
pInT2 = pInT1 + (numCols * i);
|
||||
pOutT2 = pOutT1 + (numCols * k);
|
||||
/*
|
||||
* Check if there is a non zero pivot element to
|
||||
* * replace in the rows below
|
||||
*/
|
||||
if (*pInT2 != 0.0f)
|
||||
{
|
||||
f16x8_t vecA, vecB;
|
||||
/*
|
||||
* Loop over number of columns
|
||||
* * to the right of the pilot element
|
||||
*/
|
||||
pTmpA = pInT1;
|
||||
pTmpB = pInT2;
|
||||
blkCnt = (numCols - l) >> 3;
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
|
||||
vecA = vldrhq_f16(pTmpA);
|
||||
vecB = vldrhq_f16(pTmpB);
|
||||
vstrhq_f16(pTmpB, vecA);
|
||||
vstrhq_f16(pTmpA, vecB);
|
||||
|
||||
pTmpA += 8;
|
||||
pTmpB += 8;
|
||||
/*
|
||||
* Decrement the blockSize loop counter
|
||||
*/
|
||||
blkCnt--;
|
||||
}
|
||||
/*
|
||||
* tail
|
||||
* (will be merged thru tail predication)
|
||||
*/
|
||||
blkCnt = (numCols - l) & 7;
|
||||
if (blkCnt > 0U)
|
||||
{
|
||||
mve_pred16_t p0 = vctp16q(blkCnt);
|
||||
|
||||
vecA = vldrhq_f16(pTmpA);
|
||||
vecB = vldrhq_f16(pTmpB);
|
||||
vstrhq_p_f16(pTmpB, vecA, p0);
|
||||
vstrhq_p_f16(pTmpA, vecB, p0);
|
||||
}
|
||||
|
||||
pInT1 += numCols - l;
|
||||
pInT2 += numCols - l;
|
||||
pTmpA = pOutT1;
|
||||
pTmpB = pOutT2;
|
||||
blkCnt = numCols >> 3;
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
|
||||
vecA = vldrhq_f16(pTmpA);
|
||||
vecB = vldrhq_f16(pTmpB);
|
||||
vstrhq_f16(pTmpB, vecA);
|
||||
vstrhq_f16(pTmpA, vecB);
|
||||
pTmpA += 8;
|
||||
pTmpB += 8;
|
||||
/*
|
||||
* Decrement the blockSize loop counter
|
||||
*/
|
||||
blkCnt--;
|
||||
}
|
||||
/*
|
||||
* tail
|
||||
*/
|
||||
blkCnt = numCols & 7;
|
||||
if (blkCnt > 0U)
|
||||
{
|
||||
mve_pred16_t p0 = vctp16q(blkCnt);
|
||||
|
||||
vecA = vldrhq_f16(pTmpA);
|
||||
vecB = vldrhq_f16(pTmpB);
|
||||
vstrhq_p_f16(pTmpB, vecA, p0);
|
||||
vstrhq_p_f16(pTmpA, vecB, p0);
|
||||
}
|
||||
|
||||
pOutT1 += numCols;
|
||||
pOutT2 += numCols;
|
||||
/*
|
||||
* Flag to indicate whether exchange is done or not
|
||||
*/
|
||||
flag = 1U;
|
||||
|
||||
/*
|
||||
* Break after exchange is done
|
||||
*/
|
||||
break;
|
||||
}
|
||||
/*
|
||||
* Update the destination pointer modifier
|
||||
*/
|
||||
k++;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Update the status if the matrix is singular
|
||||
*/
|
||||
if ((flag != 1U) && (in == 0.0f))
|
||||
{
|
||||
return ARM_MATH_SINGULAR;
|
||||
}
|
||||
|
||||
/*
|
||||
* Points to the pivot row of input and destination matrices
|
||||
*/
|
||||
pPivotRowIn = pIn + (l * numCols);
|
||||
pPivotRowDst = pOut + (l * numCols);
|
||||
|
||||
/*
|
||||
* Temporary pointers to the pivot row pointers
|
||||
*/
|
||||
pInT1 = pPivotRowIn;
|
||||
pOutT1 = pPivotRowDst;
|
||||
|
||||
/*
|
||||
* Pivot element of the row
|
||||
*/
|
||||
in = *(pIn + (l * numCols));
|
||||
|
||||
pTmpA = pInT1;
|
||||
|
||||
f16x8_t invIn = vdupq_n_f16(1.0f / in);
|
||||
|
||||
blkCnt = (numCols - l) >> 3;
|
||||
f16x8_t vecA;
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
*(f16x8_t *) pTmpA = *(f16x8_t *) pTmpA * invIn;
|
||||
pTmpA += 8;
|
||||
/*
|
||||
* Decrement the blockSize loop counter
|
||||
*/
|
||||
blkCnt--;
|
||||
}
|
||||
/*
|
||||
* tail
|
||||
*/
|
||||
blkCnt = (numCols - l) & 7;
|
||||
if (blkCnt > 0U)
|
||||
{
|
||||
mve_pred16_t p0 = vctp16q(blkCnt);
|
||||
|
||||
|
||||
vecA = vldrhq_f16(pTmpA);
|
||||
vecA = vecA * invIn;
|
||||
vstrhq_p_f16(pTmpA, vecA, p0);
|
||||
}
|
||||
|
||||
pInT1 += numCols - l;
|
||||
/*
|
||||
* Loop over number of columns
|
||||
* * to the right of the pilot element
|
||||
*/
|
||||
|
||||
pTmpA = pOutT1;
|
||||
blkCnt = numCols >> 3;
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
*(f16x8_t *) pTmpA = *(f16x8_t *) pTmpA *invIn;
|
||||
pTmpA += 8;
|
||||
/*
|
||||
* Decrement the blockSize loop counter
|
||||
*/
|
||||
blkCnt--;
|
||||
}
|
||||
/*
|
||||
* tail
|
||||
* (will be merged thru tail predication)
|
||||
*/
|
||||
blkCnt = numCols & 7;
|
||||
if (blkCnt > 0U)
|
||||
{
|
||||
mve_pred16_t p0 = vctp16q(blkCnt);
|
||||
|
||||
vecA = vldrhq_f16(pTmpA);
|
||||
vecA = vecA * invIn;
|
||||
vstrhq_p_f16(pTmpA, vecA, p0);
|
||||
}
|
||||
|
||||
pOutT1 += numCols;
|
||||
|
||||
/*
|
||||
* Replace the rows with the sum of that row and a multiple of row i
|
||||
* * so that each new element in column i above row i is zero.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Temporary pointers for input and destination matrices
|
||||
*/
|
||||
pInT1 = pIn;
|
||||
pOutT1 = pOut;
|
||||
|
||||
for (i = 0U; i < numRows; i++)
|
||||
{
|
||||
/*
|
||||
* Check for the pivot element
|
||||
*/
|
||||
if (i == l)
|
||||
{
|
||||
/*
|
||||
* If the processing element is the pivot element,
|
||||
* only the columns to the right are to be processed
|
||||
*/
|
||||
pInT1 += numCols - l;
|
||||
pOutT1 += numCols;
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Element of the reference row
|
||||
*/
|
||||
|
||||
/*
|
||||
* Working pointers for input and destination pivot rows
|
||||
*/
|
||||
pPRT_in = pPivotRowIn;
|
||||
pPRT_pDst = pPivotRowDst;
|
||||
/*
|
||||
* Loop over the number of columns to the right of the pivot element,
|
||||
* to replace the elements in the input matrix
|
||||
*/
|
||||
|
||||
in = *pInT1;
|
||||
f16x8_t tmpV = vdupq_n_f16(in);
|
||||
|
||||
blkCnt = (numCols - l) >> 3;
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
f16x8_t vec1, vec2;
|
||||
/*
|
||||
* Replace the element by the sum of that row
|
||||
* and a multiple of the reference row
|
||||
*/
|
||||
vec1 = vldrhq_f16(pInT1);
|
||||
vec2 = vldrhq_f16(pPRT_in);
|
||||
vec1 = vfmsq_f16(vec1, tmpV, vec2);
|
||||
vstrhq_f16(pInT1, vec1);
|
||||
pPRT_in += 8;
|
||||
pInT1 += 8;
|
||||
/*
|
||||
* Decrement the blockSize loop counter
|
||||
*/
|
||||
blkCnt--;
|
||||
}
|
||||
/*
|
||||
* tail
|
||||
* (will be merged thru tail predication)
|
||||
*/
|
||||
blkCnt = (numCols - l) & 7;
|
||||
if (blkCnt > 0U)
|
||||
{
|
||||
f16x8_t vec1, vec2;
|
||||
mve_pred16_t p0 = vctp16q(blkCnt);
|
||||
|
||||
vec1 = vldrhq_f16(pInT1);
|
||||
vec2 = vldrhq_f16(pPRT_in);
|
||||
vec1 = vfmsq_f16(vec1, tmpV, vec2);
|
||||
vstrhq_p_f16(pInT1, vec1, p0);
|
||||
pInT1 += blkCnt;
|
||||
}
|
||||
|
||||
blkCnt = numCols >> 3;
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
f16x8_t vec1, vec2;
|
||||
|
||||
/*
|
||||
* Replace the element by the sum of that row
|
||||
* and a multiple of the reference row
|
||||
*/
|
||||
vec1 = vldrhq_f16(pOutT1);
|
||||
vec2 = vldrhq_f16(pPRT_pDst);
|
||||
vec1 = vfmsq_f16(vec1, tmpV, vec2);
|
||||
vstrhq_f16(pOutT1, vec1);
|
||||
pPRT_pDst += 8;
|
||||
pOutT1 += 8;
|
||||
/*
|
||||
* Decrement the blockSize loop counter
|
||||
*/
|
||||
blkCnt--;
|
||||
}
|
||||
/*
|
||||
* tail
|
||||
* (will be merged thru tail predication)
|
||||
*/
|
||||
blkCnt = numCols & 7;
|
||||
if (blkCnt > 0U)
|
||||
{
|
||||
f16x8_t vec1, vec2;
|
||||
mve_pred16_t p0 = vctp16q(blkCnt);
|
||||
|
||||
vec1 = vldrhq_f16(pOutT1);
|
||||
vec2 = vldrhq_f16(pPRT_pDst);
|
||||
vec1 = vfmsq_f16(vec1, tmpV, vec2);
|
||||
vstrhq_p_f16(pOutT1, vec1, p0);
|
||||
|
||||
pInT2 += blkCnt;
|
||||
pOutT1 += blkCnt;
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Increment the temporary input pointer
|
||||
*/
|
||||
pInT1 = pInT1 + l;
|
||||
}
|
||||
/*
|
||||
* Increment the input pointer
|
||||
*/
|
||||
pIn++;
|
||||
/*
|
||||
* Decrement the loop counter
|
||||
*/
|
||||
loopCnt--;
|
||||
/*
|
||||
* Increment the index modifier
|
||||
*/
|
||||
l++;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set status as ARM_MATH_SUCCESS
|
||||
*/
|
||||
status = ARM_MATH_SUCCESS;
|
||||
|
||||
if ((flag != 1U) && (in == 0.0f))
|
||||
{
|
||||
pIn = pSrc->pData;
|
||||
for (i = 0; i < numRows * numCols; i++)
|
||||
{
|
||||
if (pIn[i] != 0.0f)
|
||||
break;
|
||||
}
|
||||
|
||||
if (i == numRows * numCols)
|
||||
status = ARM_MATH_SINGULAR;
|
||||
}
|
||||
}
|
||||
/* Return to application */
|
||||
return (status);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
arm_status arm_mat_inverse_f16(
|
||||
const arm_matrix_instance_f16 * pSrc,
|
||||
arm_matrix_instance_f16 * pDst)
|
||||
{
|
||||
float16_t *pIn = pSrc->pData; /* input data matrix pointer */
|
||||
float16_t *pOut = pDst->pData; /* output data matrix pointer */
|
||||
float16_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
|
||||
float16_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
|
||||
float16_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
|
||||
uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
|
||||
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
|
||||
|
||||
float16_t Xchg, in = 0.0f, in1; /* Temporary input values */
|
||||
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
|
||||
arm_status status; /* status of matrix inverse */
|
||||
|
||||
#ifdef ARM_MATH_MATRIX_CHECK
|
||||
|
||||
/* Check for matrix mismatch condition */
|
||||
if ((pSrc->numRows != pSrc->numCols) ||
|
||||
(pDst->numRows != pDst->numCols) ||
|
||||
(pSrc->numRows != pDst->numRows) )
|
||||
{
|
||||
/* Set status as ARM_MATH_SIZE_MISMATCH */
|
||||
status = ARM_MATH_SIZE_MISMATCH;
|
||||
}
|
||||
else
|
||||
|
||||
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
|
||||
|
||||
{
|
||||
|
||||
/*--------------------------------------------------------------------------------------------------------------
|
||||
* Matrix Inverse can be solved using elementary row operations.
|
||||
*
|
||||
* Gauss-Jordan Method:
|
||||
*
|
||||
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
|
||||
* augmented matrix as follows:
|
||||
* _ _ _ _
|
||||
* | a11 a12 | 1 0 | | X11 X12 |
|
||||
* | | | = | |
|
||||
* |_ a21 a22 | 0 1 _| |_ X21 X21 _|
|
||||
*
|
||||
* 2. In our implementation, pDst Matrix is used as identity matrix.
|
||||
*
|
||||
* 3. Begin with the first row. Let i = 1.
|
||||
*
|
||||
* 4. Check to see if the pivot for row i is zero.
|
||||
* The pivot is the element of the main diagonal that is on the current row.
|
||||
* For instance, if working with row i, then the pivot element is aii.
|
||||
* If the pivot is zero, exchange that row with a row below it that does not
|
||||
* contain a zero in column i. If this is not possible, then an inverse
|
||||
* to that matrix does not exist.
|
||||
*
|
||||
* 5. Divide every element of row i by the pivot.
|
||||
*
|
||||
* 6. For every row below and row i, replace that row with the sum of that row and
|
||||
* a multiple of row i so that each new element in column i below row i is zero.
|
||||
*
|
||||
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
|
||||
* for every element below and above the main diagonal.
|
||||
*
|
||||
* 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
|
||||
* Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
|
||||
*----------------------------------------------------------------------------------------------------------------*/
|
||||
|
||||
/* Working pointer for destination matrix */
|
||||
pOutT1 = pOut;
|
||||
|
||||
/* Loop over the number of rows */
|
||||
rowCnt = numRows;
|
||||
|
||||
/* Making the destination matrix as identity matrix */
|
||||
while (rowCnt > 0U)
|
||||
{
|
||||
/* Writing all zeroes in lower triangle of the destination matrix */
|
||||
j = numRows - rowCnt;
|
||||
while (j > 0U)
|
||||
{
|
||||
*pOutT1++ = 0.0f;
|
||||
j--;
|
||||
}
|
||||
|
||||
/* Writing all ones in the diagonal of the destination matrix */
|
||||
*pOutT1++ = 1.0f;
|
||||
|
||||
/* Writing all zeroes in upper triangle of the destination matrix */
|
||||
j = rowCnt - 1U;
|
||||
while (j > 0U)
|
||||
{
|
||||
*pOutT1++ = 0.0f;
|
||||
j--;
|
||||
}
|
||||
|
||||
/* Decrement loop counter */
|
||||
rowCnt--;
|
||||
}
|
||||
|
||||
/* Loop over the number of columns of the input matrix.
|
||||
All the elements in each column are processed by the row operations */
|
||||
loopCnt = numCols;
|
||||
|
||||
/* Index modifier to navigate through the columns */
|
||||
l = 0U;
|
||||
|
||||
while (loopCnt > 0U)
|
||||
{
|
||||
/* Check if the pivot element is zero..
|
||||
* If it is zero then interchange the row with non zero row below.
|
||||
* If there is no non zero element to replace in the rows below,
|
||||
* then the matrix is Singular. */
|
||||
|
||||
/* Working pointer for the input matrix that points
|
||||
* to the pivot element of the particular row */
|
||||
pInT1 = pIn + (l * numCols);
|
||||
|
||||
/* Working pointer for the destination matrix that points
|
||||
* to the pivot element of the particular row */
|
||||
pOutT1 = pOut + (l * numCols);
|
||||
|
||||
/* Temporary variable to hold the pivot value */
|
||||
in = *pInT1;
|
||||
|
||||
|
||||
/* Destination pointer modifier */
|
||||
k = 1U;
|
||||
|
||||
/* Check if the pivot element is zero */
|
||||
if (*pInT1 == 0.0f)
|
||||
{
|
||||
/* Loop over the number rows present below */
|
||||
|
||||
for (i = (l + 1U); i < numRows; i++)
|
||||
{
|
||||
/* Update the input and destination pointers */
|
||||
pInT2 = pInT1 + (numCols * i);
|
||||
pOutT2 = pOutT1 + (numCols * k);
|
||||
|
||||
/* Check if there is a non zero pivot element to
|
||||
* replace in the rows below */
|
||||
if (*pInT2 != 0.0f)
|
||||
{
|
||||
/* Loop over number of columns
|
||||
* to the right of the pilot element */
|
||||
j = numCols - l;
|
||||
|
||||
while (j > 0U)
|
||||
{
|
||||
/* Exchange the row elements of the input matrix */
|
||||
Xchg = *pInT2;
|
||||
*pInT2++ = *pInT1;
|
||||
*pInT1++ = Xchg;
|
||||
|
||||
/* Decrement the loop counter */
|
||||
j--;
|
||||
}
|
||||
|
||||
/* Loop over number of columns of the destination matrix */
|
||||
j = numCols;
|
||||
|
||||
while (j > 0U)
|
||||
{
|
||||
/* Exchange the row elements of the destination matrix */
|
||||
Xchg = *pOutT2;
|
||||
*pOutT2++ = *pOutT1;
|
||||
*pOutT1++ = Xchg;
|
||||
|
||||
/* Decrement loop counter */
|
||||
j--;
|
||||
}
|
||||
|
||||
/* Flag to indicate whether exchange is done or not */
|
||||
flag = 1U;
|
||||
|
||||
/* Break after exchange is done */
|
||||
break;
|
||||
}
|
||||
|
||||
/* Update the destination pointer modifier */
|
||||
k++;
|
||||
|
||||
/* Decrement loop counter */
|
||||
}
|
||||
}
|
||||
|
||||
/* Update the status if the matrix is singular */
|
||||
if ((flag != 1U) && (in == 0.0f))
|
||||
{
|
||||
return ARM_MATH_SINGULAR;
|
||||
}
|
||||
|
||||
/* Points to the pivot row of input and destination matrices */
|
||||
pPivotRowIn = pIn + (l * numCols);
|
||||
pPivotRowDst = pOut + (l * numCols);
|
||||
|
||||
/* Temporary pointers to the pivot row pointers */
|
||||
pInT1 = pPivotRowIn;
|
||||
pInT2 = pPivotRowDst;
|
||||
|
||||
/* Pivot element of the row */
|
||||
in = *pPivotRowIn;
|
||||
|
||||
/* Loop over number of columns
|
||||
* to the right of the pilot element */
|
||||
j = (numCols - l);
|
||||
|
||||
while (j > 0U)
|
||||
{
|
||||
/* Divide each element of the row of the input matrix
|
||||
* by the pivot element */
|
||||
in1 = *pInT1;
|
||||
*pInT1++ = in1 / in;
|
||||
|
||||
/* Decrement the loop counter */
|
||||
j--;
|
||||
}
|
||||
|
||||
/* Loop over number of columns of the destination matrix */
|
||||
j = numCols;
|
||||
|
||||
while (j > 0U)
|
||||
{
|
||||
/* Divide each element of the row of the destination matrix
|
||||
* by the pivot element */
|
||||
in1 = *pInT2;
|
||||
*pInT2++ = in1 / in;
|
||||
|
||||
/* Decrement the loop counter */
|
||||
j--;
|
||||
}
|
||||
|
||||
/* Replace the rows with the sum of that row and a multiple of row i
|
||||
* so that each new element in column i above row i is zero.*/
|
||||
|
||||
/* Temporary pointers for input and destination matrices */
|
||||
pInT1 = pIn;
|
||||
pInT2 = pOut;
|
||||
|
||||
/* index used to check for pivot element */
|
||||
i = 0U;
|
||||
|
||||
/* Loop over number of rows */
|
||||
/* to be replaced by the sum of that row and a multiple of row i */
|
||||
k = numRows;
|
||||
|
||||
while (k > 0U)
|
||||
{
|
||||
/* Check for the pivot element */
|
||||
if (i == l)
|
||||
{
|
||||
/* If the processing element is the pivot element,
|
||||
only the columns to the right are to be processed */
|
||||
pInT1 += numCols - l;
|
||||
|
||||
pInT2 += numCols;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Element of the reference row */
|
||||
in = *pInT1;
|
||||
|
||||
/* Working pointers for input and destination pivot rows */
|
||||
pPRT_in = pPivotRowIn;
|
||||
pPRT_pDst = pPivotRowDst;
|
||||
|
||||
/* Loop over the number of columns to the right of the pivot element,
|
||||
to replace the elements in the input matrix */
|
||||
j = (numCols - l);
|
||||
|
||||
while (j > 0U)
|
||||
{
|
||||
/* Replace the element by the sum of that row
|
||||
and a multiple of the reference row */
|
||||
in1 = *pInT1;
|
||||
*pInT1++ = in1 - (in * *pPRT_in++);
|
||||
|
||||
/* Decrement the loop counter */
|
||||
j--;
|
||||
}
|
||||
|
||||
/* Loop over the number of columns to
|
||||
replace the elements in the destination matrix */
|
||||
j = numCols;
|
||||
|
||||
while (j > 0U)
|
||||
{
|
||||
/* Replace the element by the sum of that row
|
||||
and a multiple of the reference row */
|
||||
in1 = *pInT2;
|
||||
*pInT2++ = in1 - (in * *pPRT_pDst++);
|
||||
|
||||
/* Decrement loop counter */
|
||||
j--;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* Increment temporary input pointer */
|
||||
pInT1 = pInT1 + l;
|
||||
|
||||
/* Decrement loop counter */
|
||||
k--;
|
||||
|
||||
/* Increment pivot index */
|
||||
i++;
|
||||
}
|
||||
|
||||
/* Increment the input pointer */
|
||||
pIn++;
|
||||
|
||||
/* Decrement the loop counter */
|
||||
loopCnt--;
|
||||
|
||||
/* Increment the index modifier */
|
||||
l++;
|
||||
}
|
||||
|
||||
/* Set status as ARM_MATH_SUCCESS */
|
||||
status = ARM_MATH_SUCCESS;
|
||||
|
||||
if ((flag != 1U) && (in == 0.0f))
|
||||
{
|
||||
pIn = pSrc->pData;
|
||||
for (i = 0; i < numRows * numCols; i++)
|
||||
{
|
||||
if (pIn[i] != 0.0f)
|
||||
break;
|
||||
}
|
||||
|
||||
if (i == numRows * numCols)
|
||||
status = ARM_MATH_SINGULAR;
|
||||
}
|
||||
}
|
||||
|
||||
/* Return to application */
|
||||
return (status);
|
||||
}
|
||||
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
|
||||
|
||||
/**
|
||||
@} end of MatrixInv group
|
||||
*/
|
||||
|
||||
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
|
||||
|
||||
@ -0,0 +1,763 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
* Project: CMSIS DSP Library
|
||||
* Title: arm_mat_mult_f16.c
|
||||
* Description: Floating-point matrix multiplication
|
||||
*
|
||||
* $Date: 18. March 2020
|
||||
* $Revision: V1.6.0
|
||||
*
|
||||
* Target Processor: Cortex-M cores
|
||||
* -------------------------------------------------------------------- */
|
||||
/*
|
||||
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the License); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "dsp/matrix_functions_f16.h"
|
||||
|
||||
#if defined(ARM_FLOAT16_SUPPORTED)
|
||||
|
||||
|
||||
/**
|
||||
* @ingroup groupMatrix
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @addtogroup MatrixMult
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Floating-point matrix multiplication.
|
||||
* @param[in] *pSrcA points to the first input matrix structure
|
||||
* @param[in] *pSrcB points to the second input matrix structure
|
||||
* @param[out] *pDst points to output matrix structure
|
||||
* @return The function returns either
|
||||
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
|
||||
*/
|
||||
|
||||
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
|
||||
|
||||
__STATIC_FORCEINLINE arm_status arm_mat_mult_f16_2x2_mve(
|
||||
const arm_matrix_instance_f16 *pSrcA,
|
||||
const arm_matrix_instance_f16 *pSrcB,
|
||||
arm_matrix_instance_f16 *pDst)
|
||||
{
|
||||
static const uint16_t offsetA[8] = { 0, 0, 2, 2, 0, 0, 2, 2 };
|
||||
/* offsetB allows to read and duplicate 1 row of B */
|
||||
static const uint16_t offsetB[8] = { 0, 1, 0, 1, 0, 1, 0, 1 };
|
||||
uint16x8_t vecOffsA, vecOffsB;
|
||||
f16x8_t vecInA, vecInB, vecDst;
|
||||
float16_t *pOut = pDst->pData; /* output data matrix pointer */
|
||||
|
||||
/*
|
||||
* load initial offsets
|
||||
*/
|
||||
vecOffsA = vldrhq_u16((uint16_t const *) offsetA);
|
||||
vecOffsB = vldrhq_u16((uint16_t const *) offsetB);
|
||||
/*
|
||||
* load {a00 a00 a10 a10 x x x x }
|
||||
*/
|
||||
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
|
||||
/*
|
||||
* load {b00 b01 b00 b01 x x x x }
|
||||
*/
|
||||
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
|
||||
/*
|
||||
* { a00 b00 a00 b01
|
||||
* a10 b00 a10 b01
|
||||
* x x
|
||||
* x x }
|
||||
*/
|
||||
vecDst = vmulq(vecInA, vecInB);
|
||||
/*
|
||||
* move to 2nd column of matrix A
|
||||
*/
|
||||
vecOffsA = vaddq(vecOffsA, (uint16_t) 1);
|
||||
/*
|
||||
* load {a01 a01 a11 a11 x x x x}
|
||||
*/
|
||||
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
|
||||
/*
|
||||
* move to next B row
|
||||
*/
|
||||
vecOffsB = vaddq(vecOffsB, (uint16_t) 2);
|
||||
/*
|
||||
* load {b10, b11, b10, b11, x x x x }
|
||||
*/
|
||||
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
|
||||
/*
|
||||
* { a00 b00 + a01 b10 a00 b01 + a01 b11
|
||||
* a10 b00 + a11 b10 a10 b01 + a11 b11
|
||||
* x x
|
||||
* x x }
|
||||
*/
|
||||
vecDst = vfmaq(vecDst, vecInA, vecInB);
|
||||
|
||||
mve_pred16_t p0 = vctp16q(2*2);
|
||||
/*
|
||||
* Store the result in the destination buffer
|
||||
* (lower half of the vector)
|
||||
*/
|
||||
vstrhq_p(pOut, vecDst, p0);
|
||||
|
||||
return (ARM_MATH_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
__STATIC_FORCEINLINE arm_status arm_mat_mult_f16_3x3_mve(
|
||||
const arm_matrix_instance_f16 *pSrcA,
|
||||
const arm_matrix_instance_f16 *pSrcB,
|
||||
arm_matrix_instance_f16 *pDst)
|
||||
{
|
||||
static const uint16_t offsetA[8] = { 0, 0, 0, 3, 3, 3, 6, 6 };
|
||||
/* offsetB allows to read and duplicate 1 row of B */
|
||||
static const uint16_t offsetB[8] = { 0, 1, 2, 0, 1, 2, 0, 1 };
|
||||
uint16x8_t vecOffsA, vecOffsB;
|
||||
f16x8_t vecInA, vecInB, vecDst;
|
||||
float16_t *pOut = pDst->pData; /* output data matrix pointer */
|
||||
|
||||
/*
|
||||
* load initial offsets
|
||||
*/
|
||||
vecOffsA = vldrhq_u16((uint16_t const *) offsetA);
|
||||
vecOffsB = vldrhq_u16((uint16_t const *) offsetB);
|
||||
|
||||
/*
|
||||
* load {a00 a00 a00 a10 a10 a10 a20 a20}
|
||||
*/
|
||||
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
|
||||
/*
|
||||
* load {b00 b01 b02 b00 b01 b02 b00 b01}
|
||||
*/
|
||||
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
|
||||
/*
|
||||
* { a00 b00 a00 b01 a00 b02
|
||||
* a10 b00 a10 b01 a10 b02
|
||||
* a20 b00 a20 b01}
|
||||
*/
|
||||
vecDst = vmulq(vecInA, vecInB);
|
||||
|
||||
/*
|
||||
* move to 2nd column of matrix A
|
||||
*/
|
||||
vecOffsA = vaddq(vecOffsA, (uint16_t) 1);
|
||||
/*
|
||||
* load {a01 a01 a01 a11 a11 a11 a21 a21}
|
||||
*/
|
||||
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
|
||||
/*
|
||||
* move to next B row
|
||||
*/
|
||||
vecOffsB = vaddq(vecOffsB, (uint16_t) 3);
|
||||
/*
|
||||
* load {b10, b11, b12, b10, b11, b12, b10, b11}
|
||||
*/
|
||||
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
|
||||
/*
|
||||
* { a00 b00 + a01 b10 a00 b01 + a01 b11 a00 b02 + a01 b12
|
||||
* a10 b00 + a11 b10 a10 b01 + a11 b11 a10 b02 + a11 b12
|
||||
* a20 b00 + a21 b10 a20 b01 + a21 b11 }
|
||||
*/
|
||||
vecDst = vfmaq(vecDst, vecInA, vecInB);
|
||||
/*
|
||||
* move to 3rd column of matrix A
|
||||
*/
|
||||
vecOffsA = vaddq(vecOffsA, (uint16_t) 1);
|
||||
/*
|
||||
* load {a02 a02 a02 a12 a12 a12 a22 a22}
|
||||
*/
|
||||
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
|
||||
/*
|
||||
* move to next B row
|
||||
*/
|
||||
vecOffsB = vaddq(vecOffsB, (uint16_t) 3);
|
||||
/*
|
||||
* load {b20, b21, b22, b20, b21, b22, b20, b21}
|
||||
*/
|
||||
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
|
||||
/*
|
||||
* {a00 b00 + a01 b10 + a02 b20 a00 b01 + a01 b11 + a02 b21 a00 b02 + a01 b12 + a02 b22},
|
||||
* a10 b00 + a11 b10 + a12 b20 a10 b01 + a11 b11 + a12 b21 a10 b02 + a11 b12 + a12 b22},
|
||||
* a20 b00 + a21 b10 + a22 b20 a20 b01 + a21 b11 + a22 b21 }
|
||||
*/
|
||||
vecDst = vfmaq(vecDst, vecInA, vecInB);
|
||||
|
||||
/*
|
||||
* Store the result in the destination buffer
|
||||
*/
|
||||
vst1q(pOut, vecDst); pOut += 8;
|
||||
|
||||
/* last element computed in scalar mode
|
||||
* a20 b02 + a21 b12 + a22 b22
|
||||
*/
|
||||
_Float16 * pA = (_Float16 *)pSrcA->pData;
|
||||
_Float16 * pB = (_Float16 *)pSrcB->pData;
|
||||
*pOut = pA[2*3] * pB[2] + pA[2*3+1] * pB[3+2] + pA[2*3+2] * pB[2*3+2];
|
||||
|
||||
return (ARM_MATH_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
__STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
|
||||
const arm_matrix_instance_f16 *pSrcA,
|
||||
const arm_matrix_instance_f16 *pSrcB,
|
||||
arm_matrix_instance_f16 *pDst)
|
||||
{
|
||||
/* offsetA allows to read and duplicate 2 successive column elements of A */
|
||||
static const uint16_t offsetA[8] = { 0, 0, 0, 0, 4, 4, 4, 4 };
|
||||
/* offsetB allows to read and duplicate 1 row of B */
|
||||
static const uint16_t offsetB[8] = { 0, 1, 2, 3, 0, 1, 2, 3 };
|
||||
uint16x8_t vecOffsA, vecOffsB;
|
||||
f16x8_t vecInA, vecInB, vecDst0, vecDst1;
|
||||
float16_t *pOut = pDst->pData; /* output data matrix pointer */
|
||||
|
||||
/*
|
||||
* load initial offsets
|
||||
*/
|
||||
vecOffsA = vldrhq_u16((uint16_t const *) offsetA);
|
||||
vecOffsB = vldrhq_u16((uint16_t const *) offsetB);
|
||||
|
||||
/*
|
||||
* load {a00 a00 a00 a00 a10 a10 a10 a10}
|
||||
*/
|
||||
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
|
||||
/*
|
||||
* load {b00 b01 b02 b03 b00 b01 b02 b03}
|
||||
*/
|
||||
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
|
||||
/*
|
||||
* { a00 b00 a00 b01 a00 b02 a00 b03
|
||||
* a10 b00 a10 b01 a10 b02 a10 b03 }
|
||||
*/
|
||||
vecDst0 = vmulq(vecInA, vecInB);
|
||||
/*
|
||||
* jump 2 x A rows (2nd half of matrix)
|
||||
*/
|
||||
vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
|
||||
/*
|
||||
* load {a20 a20 a20 a20 a30 a30 a30 a30}
|
||||
*/
|
||||
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
|
||||
/*
|
||||
* { a20 b00 a20 b01 a20 b02 a20 b03
|
||||
* a30 b00 a30 b01 a30 b02 + a31 b12 }
|
||||
*/
|
||||
vecDst1 = vmulq(vecInA, vecInB);
|
||||
/*
|
||||
* rewind back to top half of the A matrix (2nd column)
|
||||
*/
|
||||
vecOffsA = vsubq(vecOffsA, (uint16_t) 7);
|
||||
/*
|
||||
* load {a01 a01 a01 a01 a11 a11 a11 a11}
|
||||
*/
|
||||
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
|
||||
/*
|
||||
* move to next B row
|
||||
*/
|
||||
vecOffsB = vaddq(vecOffsB, (uint16_t) 4);
|
||||
/*
|
||||
* load {b10, b11, b12, b13, b10, b11, b12, b13}
|
||||
*/
|
||||
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
|
||||
/*
|
||||
* { a00 b00 + a01 b10 a00 b01 + a01 b11 a00 b02 + a01 b12 a00 b03 + a01 b13
|
||||
* a10 b00 + a11 b10 a10 b01 + a11 b11 a10 b02 + a11 b12 a10 b03 + a11 b13 }
|
||||
*/
|
||||
vecDst0 = vfmaq(vecDst0, vecInA, vecInB);
|
||||
/*
|
||||
* jump 2 x A rows (2nd half of matrix)
|
||||
*/
|
||||
vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
|
||||
/*
|
||||
* load {a21 a21 a21 a21 a31 a31 a31 a31}
|
||||
*/
|
||||
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
|
||||
/*
|
||||
* {a20 b00 + a21 b10 a20 b01 + a21 b11 a20 b02 + a21 b12 a20 b03 + a21 b13
|
||||
* a30 b00 + a31 b10 a30 b01 + a31 b11 a30 b02 + a31 b12 a30 b03 + a31 b13 }
|
||||
*/
|
||||
vecDst1 = vfmaq(vecDst1, vecInA, vecInB);
|
||||
|
||||
/*
|
||||
* rewind back to top half of the A matrix (3rd column)
|
||||
*/
|
||||
vecOffsA = vsubq(vecOffsA, (uint16_t) 7);
|
||||
/*
|
||||
* load {a02 a02 a02 a02 a12 a12 a12 a12}
|
||||
*/
|
||||
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
|
||||
/*
|
||||
* move to next B row
|
||||
*/
|
||||
vecOffsB = vaddq(vecOffsB, (uint16_t) 4);
|
||||
/*
|
||||
* load {b20, b21, b22, b23, b20, b21, b22, b23}
|
||||
*/
|
||||
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
|
||||
/*
|
||||
* { a00 b00 + a01 b10 + a02 b20 a00 b01 + a01 b11 + a02 b21 a00 b02 + a01 b12 + a02 b22 a00 b03 + a01 b13 + a02 b23
|
||||
* a10 b00 + a11 b10 + a12 b20 a10 b01 + a11 b11 + a12 b21 a10 b02 + a11 b12 + a12 b22 a10 b03 + a11 b13 + a12 b23 }
|
||||
*/
|
||||
vecDst0 = vfmaq(vecDst0, vecInA, vecInB);
|
||||
/*
|
||||
* jump 2 x A rows
|
||||
*/
|
||||
vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
|
||||
|
||||
/*
|
||||
* load {a22 a22 a22 a22 a32 a32 a32 a32}
|
||||
*/
|
||||
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
|
||||
/*
|
||||
* {a20 b00 + a21 b10 + a22 b20 a20 b01 + a21 b11 + a22 b21 a20 b02 + a21 b12 + a22 b22 a20 b03 + a21 b13 + a22 b23
|
||||
* a30 b00 + a31 b10 + a32 b20 a30 b01 + a31 b11 + a32 b21 a30 b02 + a31 b12 + a32 b22 a30 b03 + a31 b13 + a32 b23 }
|
||||
*/
|
||||
vecDst1 = vfmaq(vecDst1, vecInA, vecInB);
|
||||
|
||||
/*
|
||||
* rewind back to top half of the A matrix (4th column)
|
||||
*/
|
||||
vecOffsA = vsubq(vecOffsA, (uint16_t) 7);
|
||||
/*
|
||||
* load {a03 a03 a03 a03 a13 a13 a13 a13}
|
||||
*/
|
||||
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
|
||||
/*
|
||||
* move to next B row
|
||||
*/
|
||||
vecOffsB = vaddq(vecOffsB, (uint16_t) 4);
|
||||
/*
|
||||
* load {b30, b31, b32, b33, b30, b31, b32, b33}
|
||||
*/
|
||||
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
|
||||
/*
|
||||
* { a00 b00 +...+ a03 b30, a00 b01 +...+ a03 b31, a00 b02 +...+ a03 b32, a00 b03 +...+ a03 b33
|
||||
* a10 b00 +...+ a13 b30, a10 b01 +...+ a13 b31, a10 b02 +...+ a13 b32, a10 b03 +...+ a13 b33 }
|
||||
*/
|
||||
vecDst0 = vfmaq(vecDst0, vecInA, vecInB);
|
||||
/*
|
||||
* jump 2 x A rows
|
||||
*/
|
||||
vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
|
||||
/*
|
||||
* load {a23 a23 a23 a23 a33 a33 a33 a33}
|
||||
*/
|
||||
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
|
||||
/*
|
||||
* {a20 b00 +...+ a23 b30, a20 b01 +...+ a23 b31, a20 b02 +...+ a23 b32, a20 b03 +...+ a23 b33
|
||||
* a30 b00 +...+ a33 b30, a30 b01 +...+ a33 b31, a30 b02 +...+ a33 b32, a30 b03 +...+ a33 b33 }
|
||||
*/
|
||||
vecDst1 = vfmaq(vecDst1, vecInA, vecInB);
|
||||
|
||||
/*
|
||||
* Store the result in the destination buffer
|
||||
*/
|
||||
vst1q(pOut, vecDst0); pOut += 8;
|
||||
vst1q(pOut, vecDst1);
|
||||
|
||||
return (ARM_MATH_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
arm_status arm_mat_mult_f16(
|
||||
const arm_matrix_instance_f16 * pSrcA,
|
||||
const arm_matrix_instance_f16 * pSrcB,
|
||||
arm_matrix_instance_f16 * pDst)
|
||||
{
|
||||
float16_t *pInB = pSrcB->pData; /* input data matrix pointer B */
|
||||
float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
|
||||
float16_t *pOut = pDst->pData; /* output data matrix pointer */
|
||||
int numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
|
||||
int numColsB = pSrcB->numCols; /* number of columns of input matrix B */
|
||||
int numColsA = pSrcA->numCols; /* number of columns of input matrix A */
|
||||
uint32_t blkCnt; /* loop counters */
|
||||
int i;
|
||||
|
||||
|
||||
#ifdef ARM_MATH_MATRIX_CHECK
|
||||
|
||||
/* Check for matrix mismatch condition */
|
||||
if ((pSrcA->numCols != pSrcB->numRows) ||
|
||||
(pSrcA->numRows != pDst->numRows) ||
|
||||
(pSrcB->numCols != pDst->numCols) )
|
||||
{
|
||||
/* Set status as ARM_MATH_SIZE_MISMATCH */
|
||||
status = ARM_MATH_SIZE_MISMATCH;
|
||||
}
|
||||
else
|
||||
|
||||
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
|
||||
{
|
||||
/* small squared matrix specialized routines */
|
||||
if(numRowsA == numColsB && numColsB == numColsA) {
|
||||
if(numRowsA == 2)
|
||||
return arm_mat_mult_f16_2x2_mve(pSrcA, pSrcB, pDst);
|
||||
else if(numRowsA == 3)
|
||||
return arm_mat_mult_f16_3x3_mve(pSrcA, pSrcB, pDst);
|
||||
else if(numRowsA == 4)
|
||||
return arm_mat_mult_f16_4x4_mve(pSrcA, pSrcB, pDst);
|
||||
}
|
||||
|
||||
/* main loop process 4 rows */
|
||||
i = numRowsA / 4;
|
||||
while(i > 0)
|
||||
{
|
||||
float16_t *pInA0, *pInA1, *pInA2, *pInA3;
|
||||
float16_t *pInB0;
|
||||
float16_t *pOut0, *pOut1, *pOut2, *pOut3;
|
||||
f16x8_t vecMac0, vecMac1, vecMac2, vecMac3;
|
||||
f16x8_t vecInB;
|
||||
|
||||
/* pointers to 4 consecutive output rows */
|
||||
pOut0 = pOut;
|
||||
pOut1 = pOut0 + numColsB;
|
||||
pOut2 = pOut1 + numColsB;
|
||||
pOut3 = pOut2 + numColsB;
|
||||
pInB0 = pInB;
|
||||
|
||||
int k = numColsB >> 3;
|
||||
while(k > 0)
|
||||
{
|
||||
/* pointers to 4 consecutive Matrix A rows */
|
||||
pInA0 = pInA;
|
||||
pInA1 = pInA0 + numColsA;
|
||||
pInA2 = pInA1 + numColsA;
|
||||
pInA3 = pInA2 + numColsA;
|
||||
|
||||
vecMac0 = vdupq_n_f16(0.0f16);
|
||||
vecMac1 = vdupq_n_f16(0.0f16);
|
||||
vecMac2 = vdupq_n_f16(0.0f16);
|
||||
vecMac3 = vdupq_n_f16(0.0f16);
|
||||
|
||||
blkCnt = numColsA;
|
||||
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/*
|
||||
* load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3..., bi,4n+7}
|
||||
*/
|
||||
vecInB = *(f16x8_t *)pInB0; /* vldrhq_f16(pInB0, 0); */
|
||||
|
||||
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
|
||||
vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
|
||||
vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
|
||||
vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
|
||||
|
||||
pInB0 = pInB0 + numColsB;
|
||||
/*
|
||||
* Decrement the blockSize loop counter
|
||||
*/
|
||||
blkCnt--;
|
||||
}
|
||||
|
||||
/* Store the results (4 x 8 block) in the destination buffer */
|
||||
vst1q(pOut0, vecMac0); pOut0 += 8;
|
||||
vst1q(pOut1, vecMac1); pOut1 += 8;
|
||||
vst1q(pOut2, vecMac2); pOut2 += 8;
|
||||
vst1q(pOut3, vecMac3); pOut3 += 8;
|
||||
/*
|
||||
* rewind
|
||||
*/
|
||||
pInB0 -= (numColsB * numColsA) - 8;
|
||||
k--;
|
||||
}
|
||||
|
||||
int colBLeft = numColsB & 7;
|
||||
if (colBLeft)
|
||||
{
|
||||
pInA0 = pInA;
|
||||
pInA1 = pInA0 + numColsA;
|
||||
pInA2 = pInA1 + numColsA;
|
||||
pInA3 = pInA2 + numColsA;
|
||||
mve_pred16_t p0 = vctp16q(colBLeft);
|
||||
|
||||
vecMac0 = vdupq_n_f16(0.0f16);
|
||||
vecMac1 = vdupq_n_f16(0.0f16);
|
||||
vecMac2 = vdupq_n_f16(0.0f16);
|
||||
vecMac3 = vdupq_n_f16(0.0f16);
|
||||
|
||||
blkCnt = numColsA;
|
||||
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/*
|
||||
* load {bi,4n+0, bi,4n+1, bi,4n+2, ..bi,4n+colBLeft-1, 0, ..}
|
||||
*/
|
||||
vecInB = vldrhq_z_f16(pInB0, p0);
|
||||
|
||||
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
|
||||
vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
|
||||
vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
|
||||
vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
|
||||
|
||||
pInB0 = pInB0 + numColsB;
|
||||
/*
|
||||
* Decrement the blockSize loop counter
|
||||
*/
|
||||
blkCnt--;
|
||||
}
|
||||
|
||||
/* Store the results (4 x colBLeft block) in the destination buffer */
|
||||
vstrhq_p_f16(pOut0, vecMac0, p0);
|
||||
vstrhq_p_f16(pOut1, vecMac1, p0);
|
||||
vstrhq_p_f16(pOut2, vecMac2, p0);
|
||||
vstrhq_p_f16(pOut3, vecMac3, p0);
|
||||
}
|
||||
|
||||
pInA += 4 * numColsA;
|
||||
pOut += 4 * numColsB;
|
||||
i--;
|
||||
}
|
||||
|
||||
/*
|
||||
* non multiple of 4 rows for Matrix A
|
||||
* process single row
|
||||
*/
|
||||
if (numRowsA & 3)
|
||||
{
|
||||
i = numRowsA & 3;
|
||||
do
|
||||
{
|
||||
float16_t *pInA0;
|
||||
float16_t *pInB0;
|
||||
float16_t *pOut0;
|
||||
f16x8_t vecInB;
|
||||
f16x8_t vecMac0;
|
||||
|
||||
pOut0 = pOut;
|
||||
pInB0 = pInB;
|
||||
|
||||
int k = numColsB >> 3;
|
||||
while(k > 0)
|
||||
{
|
||||
pInA0 = pInA;
|
||||
|
||||
vecMac0 = vdupq_n_f16(0.0f16);
|
||||
blkCnt = numColsA;
|
||||
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/*
|
||||
* load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3, ...bi,4n+7}
|
||||
*/
|
||||
vecInB = *(f16x8_t *)pInB0; /* vldrhq_f16(pInB0, 0); */
|
||||
|
||||
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
|
||||
|
||||
pInB0 = pInB0 + numColsB;
|
||||
/*
|
||||
* Decrement the blockSize loop counter
|
||||
*/
|
||||
blkCnt--;
|
||||
}
|
||||
/* Store the results (1 x 8 block) in the destination buffer */
|
||||
vst1q(pOut0, vecMac0); pOut0 += 8;
|
||||
/*
|
||||
* rewind
|
||||
*/
|
||||
pInB0 -= (numColsB * numColsA) - 8;
|
||||
k--;
|
||||
}
|
||||
|
||||
int colBLeft = numColsB & 7;
|
||||
if (colBLeft)
|
||||
{
|
||||
pInA0 = pInA;
|
||||
mve_pred16_t p0 = vctp16q(colBLeft);
|
||||
|
||||
vecMac0 = vdupq_n_f16(0.0f16);
|
||||
blkCnt = numColsA;
|
||||
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/*
|
||||
* load {bi,4n+0, bi,4n+1, bi,4n+2, ..., bi,4n+colBLeft, 0, ...}
|
||||
*/
|
||||
vecInB = vldrhq_z_f16(pInB0, p0);
|
||||
|
||||
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
|
||||
|
||||
pInB0 = pInB0 + numColsB;
|
||||
/*
|
||||
* Decrement the blockSize loop counter
|
||||
*/
|
||||
blkCnt--;
|
||||
}
|
||||
/* Store the results (1 x colBLeft block) in the destination buffer */
|
||||
vstrhq_p_f16(pOut0, vecMac0, p0);
|
||||
}
|
||||
|
||||
pInA += 1 * numColsA;
|
||||
pOut += 1 * numColsB;
|
||||
}
|
||||
while (--i);
|
||||
}
|
||||
/*
|
||||
* Return to application
|
||||
*/
|
||||
return (ARM_MATH_SUCCESS);
|
||||
}
|
||||
}
|
||||
#else
|
||||
|
||||
|
||||
arm_status arm_mat_mult_f16(
|
||||
const arm_matrix_instance_f16 * pSrcA,
|
||||
const arm_matrix_instance_f16 * pSrcB,
|
||||
arm_matrix_instance_f16 * pDst)
|
||||
{
|
||||
float16_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
|
||||
float16_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
|
||||
float16_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
|
||||
float16_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
|
||||
float16_t *pOut = pDst->pData; /* Output data matrix pointer */
|
||||
float16_t *px; /* Temporary output data matrix pointer */
|
||||
float16_t sum; /* Accumulator */
|
||||
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
|
||||
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
|
||||
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
|
||||
uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
|
||||
arm_status status; /* Status of matrix multiplication */
|
||||
|
||||
#ifdef ARM_MATH_MATRIX_CHECK
|
||||
|
||||
/* Check for matrix mismatch condition */
|
||||
if ((pSrcA->numCols != pSrcB->numRows) ||
|
||||
(pSrcA->numRows != pDst->numRows) ||
|
||||
(pSrcB->numCols != pDst->numCols) )
|
||||
{
|
||||
/* Set status as ARM_MATH_SIZE_MISMATCH */
|
||||
status = ARM_MATH_SIZE_MISMATCH;
|
||||
}
|
||||
else
|
||||
|
||||
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
|
||||
|
||||
{
|
||||
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
|
||||
/* row loop */
|
||||
do
|
||||
{
|
||||
/* Output pointer is set to starting address of row being processed */
|
||||
px = pOut + i;
|
||||
|
||||
/* For every row wise process, column loop counter is to be initiated */
|
||||
col = numColsB;
|
||||
|
||||
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
|
||||
pIn2 = pSrcB->pData;
|
||||
|
||||
/* column loop */
|
||||
do
|
||||
{
|
||||
/* Set the variable sum, that acts as accumulator, to zero */
|
||||
sum = 0.0f;
|
||||
|
||||
/* Initialize pointer pIn1 to point to starting address of column being processed */
|
||||
pIn1 = pInA;
|
||||
|
||||
#if defined (ARM_MATH_LOOPUNROLL)
|
||||
|
||||
/* Loop unrolling: Compute 4 MACs at a time. */
|
||||
colCnt = numColsA >> 2U;
|
||||
|
||||
/* matrix multiplication */
|
||||
while (colCnt > 0U)
|
||||
{
|
||||
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
|
||||
|
||||
/* Perform the multiply-accumulates */
|
||||
sum += *pIn1++ * *pIn2;
|
||||
pIn2 += numColsB;
|
||||
|
||||
sum += *pIn1++ * *pIn2;
|
||||
pIn2 += numColsB;
|
||||
|
||||
sum += *pIn1++ * *pIn2;
|
||||
pIn2 += numColsB;
|
||||
|
||||
sum += *pIn1++ * *pIn2;
|
||||
pIn2 += numColsB;
|
||||
|
||||
/* Decrement loop counter */
|
||||
colCnt--;
|
||||
}
|
||||
|
||||
/* Loop unrolling: Compute remaining MACs */
|
||||
colCnt = numColsA % 0x4U;
|
||||
|
||||
#else
|
||||
|
||||
/* Initialize cntCnt with number of columns */
|
||||
colCnt = numColsA;
|
||||
|
||||
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
|
||||
|
||||
while (colCnt > 0U)
|
||||
{
|
||||
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
|
||||
|
||||
/* Perform the multiply-accumulates */
|
||||
sum += *pIn1++ * *pIn2;
|
||||
pIn2 += numColsB;
|
||||
|
||||
/* Decrement loop counter */
|
||||
colCnt--;
|
||||
}
|
||||
|
||||
/* Store result in destination buffer */
|
||||
*px++ = sum;
|
||||
|
||||
/* Decrement column loop counter */
|
||||
col--;
|
||||
|
||||
/* Update pointer pIn2 to point to starting address of next column */
|
||||
pIn2 = pInB + (numColsB - col);
|
||||
|
||||
} while (col > 0U);
|
||||
|
||||
/* Update pointer pInA to point to starting address of next row */
|
||||
i = i + numColsB;
|
||||
pInA = pInA + numColsA;
|
||||
|
||||
/* Decrement row loop counter */
|
||||
row--;
|
||||
|
||||
} while (row > 0U);
|
||||
|
||||
/* Set status as ARM_MATH_SUCCESS */
|
||||
status = ARM_MATH_SUCCESS;
|
||||
}
|
||||
|
||||
/* Return to application */
|
||||
return (status);
|
||||
}
|
||||
|
||||
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
|
||||
|
||||
/**
|
||||
* @} end of MatrixMult group
|
||||
*/
|
||||
|
||||
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
|
||||
|
||||
@ -0,0 +1,206 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
* Project: CMSIS DSP Library
|
||||
* Title: arm_mat_scale_f16.c
|
||||
* Description: Multiplies a floating-point matrix by a scalar
|
||||
*
|
||||
* $Date: 18. March 2020
|
||||
* $Revision: V1.6.0
|
||||
*
|
||||
* Target Processor: Cortex-M cores
|
||||
* -------------------------------------------------------------------- */
|
||||
/*
|
||||
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the License); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "dsp/matrix_functions_f16.h"
|
||||
|
||||
#if defined(ARM_FLOAT16_SUPPORTED)
|
||||
|
||||
|
||||
/**
|
||||
@ingroup groupMatrix
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
@addtogroup MatrixScale
|
||||
@{
|
||||
*/
|
||||
|
||||
/**
|
||||
@brief Floating-point matrix scaling.
|
||||
@param[in] pSrc points to input matrix
|
||||
@param[in] scale scale factor to be applied
|
||||
@param[out] pDst points to output matrix structure
|
||||
@return execution status
|
||||
- \ref ARM_MATH_SUCCESS : Operation successful
|
||||
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
|
||||
*/
|
||||
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
|
||||
|
||||
arm_status arm_mat_scale_f16(
|
||||
const arm_matrix_instance_f16 * pSrc,
|
||||
float16_t scale,
|
||||
arm_matrix_instance_f16 * pDst)
|
||||
{
|
||||
arm_status status; /* status of matrix scaling */
|
||||
#ifdef ARM_MATH_MATRIX_CHECK
|
||||
/* Check for matrix mismatch condition */
|
||||
if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
|
||||
{
|
||||
/* Set status as ARM_MATH_SIZE_MISMATCH */
|
||||
status = ARM_MATH_SIZE_MISMATCH;
|
||||
}
|
||||
else
|
||||
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
|
||||
{
|
||||
float16_t *pIn = pSrc->pData; /* input data matrix pointer */
|
||||
float16_t *pOut = pDst->pData; /* output data matrix pointer */
|
||||
uint32_t numSamples; /* total number of elements in the matrix */
|
||||
uint32_t blkCnt; /* loop counters */
|
||||
f16x8_t vecIn, vecOut;
|
||||
float16_t const *pInVec;
|
||||
|
||||
pInVec = (float16_t const *) pIn;
|
||||
/*
|
||||
* Total number of samples in the input matrix
|
||||
*/
|
||||
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
|
||||
blkCnt = numSamples >> 3;
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/*
|
||||
* C(m,n) = A(m,n) * scale
|
||||
* Scaling and results are stored in the destination buffer.
|
||||
*/
|
||||
vecIn = vld1q(pInVec);
|
||||
pInVec += 8;
|
||||
|
||||
vecOut = vmulq(vecIn, scale);
|
||||
|
||||
vst1q(pOut, vecOut);
|
||||
pOut += 8;
|
||||
/*
|
||||
* Decrement the blockSize loop counter
|
||||
*/
|
||||
blkCnt--;
|
||||
}
|
||||
/*
|
||||
* tail
|
||||
*/
|
||||
blkCnt = numSamples & 7;
|
||||
if (blkCnt > 0U)
|
||||
{
|
||||
mve_pred16_t p0 = vctp16q(blkCnt);
|
||||
vecIn = vld1q(pInVec);
|
||||
vecOut = vecIn * scale;
|
||||
|
||||
vstrhq_p(pOut, vecOut, p0);
|
||||
}
|
||||
/* Set status as ARM_MATH_SUCCESS */
|
||||
status = ARM_MATH_SUCCESS;
|
||||
}
|
||||
|
||||
/* Return to application */
|
||||
return (status);
|
||||
|
||||
}
|
||||
#else
|
||||
|
||||
arm_status arm_mat_scale_f16(
|
||||
const arm_matrix_instance_f16 * pSrc,
|
||||
float16_t scale,
|
||||
arm_matrix_instance_f16 * pDst)
|
||||
{
|
||||
float16_t *pIn = pSrc->pData; /* Input data matrix pointer */
|
||||
float16_t *pOut = pDst->pData; /* Output data matrix pointer */
|
||||
uint32_t numSamples; /* Total number of elements in the matrix */
|
||||
uint32_t blkCnt; /* Loop counters */
|
||||
arm_status status; /* Status of matrix scaling */
|
||||
|
||||
#ifdef ARM_MATH_MATRIX_CHECK
|
||||
|
||||
/* Check for matrix mismatch condition */
|
||||
if ((pSrc->numRows != pDst->numRows) ||
|
||||
(pSrc->numCols != pDst->numCols) )
|
||||
{
|
||||
/* Set status as ARM_MATH_SIZE_MISMATCH */
|
||||
status = ARM_MATH_SIZE_MISMATCH;
|
||||
}
|
||||
else
|
||||
|
||||
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
|
||||
|
||||
{
|
||||
/* Total number of samples in input matrix */
|
||||
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
|
||||
|
||||
#if defined (ARM_MATH_LOOPUNROLL)
|
||||
|
||||
/* Loop unrolling: Compute 4 outputs at a time */
|
||||
blkCnt = numSamples >> 2U;
|
||||
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/* C(m,n) = A(m,n) * scale */
|
||||
|
||||
/* Scale and store result in destination buffer. */
|
||||
*pOut++ = (*pIn++) * scale;
|
||||
*pOut++ = (*pIn++) * scale;
|
||||
*pOut++ = (*pIn++) * scale;
|
||||
*pOut++ = (*pIn++) * scale;
|
||||
|
||||
/* Decrement loop counter */
|
||||
blkCnt--;
|
||||
}
|
||||
|
||||
/* Loop unrolling: Compute remaining outputs */
|
||||
blkCnt = numSamples % 0x4U;
|
||||
|
||||
#else
|
||||
|
||||
/* Initialize blkCnt with number of samples */
|
||||
blkCnt = numSamples;
|
||||
|
||||
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
|
||||
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/* C(m,n) = A(m,n) * scale */
|
||||
|
||||
/* Scale and store result in destination buffer. */
|
||||
*pOut++ = (*pIn++) * scale;
|
||||
|
||||
/* Decrement loop counter */
|
||||
blkCnt--;
|
||||
}
|
||||
|
||||
/* Set status as ARM_MATH_SUCCESS */
|
||||
status = ARM_MATH_SUCCESS;
|
||||
}
|
||||
|
||||
/* Return to application */
|
||||
return (status);
|
||||
}
|
||||
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
|
||||
|
||||
/**
|
||||
@} end of MatrixScale group
|
||||
*/
|
||||
|
||||
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
|
||||
|
||||
@ -0,0 +1,215 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
* Project: CMSIS DSP Library
|
||||
* Title: arm_mat_sub_f16.c
|
||||
* Description: Floating-point matrix subtraction
|
||||
*
|
||||
* $Date: 18. March 2020
|
||||
* $Revision: V1.6.0
|
||||
*
|
||||
* Target Processor: Cortex-M cores
|
||||
* -------------------------------------------------------------------- */
|
||||
/*
|
||||
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the License); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "dsp/matrix_functions_f16.h"
|
||||
|
||||
#if defined(ARM_FLOAT16_SUPPORTED)
|
||||
|
||||
|
||||
/**
|
||||
@ingroup groupMatrix
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
@addtogroup MatrixSub
|
||||
@{
|
||||
*/
|
||||
|
||||
/**
|
||||
@brief Floating-point matrix subtraction.
|
||||
@param[in] pSrcA points to the first input matrix structure
|
||||
@param[in] pSrcB points to the second input matrix structure
|
||||
@param[out] pDst points to output matrix structure
|
||||
@return execution status
|
||||
- \ref ARM_MATH_SUCCESS : Operation successful
|
||||
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
|
||||
*/
|
||||
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
|
||||
|
||||
arm_status arm_mat_sub_f16(
|
||||
const arm_matrix_instance_f16 * pSrcA,
|
||||
const arm_matrix_instance_f16 * pSrcB,
|
||||
arm_matrix_instance_f16 * pDst)
|
||||
{
|
||||
arm_status status; /* status of matrix subtraction */
|
||||
uint32_t numSamples; /* total number of elements in the matrix */
|
||||
float16_t *pDataA, *pDataB, *pDataDst;
|
||||
f16x8_t vecA, vecB, vecDst;
|
||||
float16_t const *pSrcAVec;
|
||||
float16_t const *pSrcBVec;
|
||||
uint32_t blkCnt; /* loop counters */
|
||||
|
||||
pDataA = pSrcA->pData;
|
||||
pDataB = pSrcB->pData;
|
||||
pDataDst = pDst->pData;
|
||||
pSrcAVec = (float16_t const *) pDataA;
|
||||
pSrcBVec = (float16_t const *) pDataB;
|
||||
|
||||
#ifdef ARM_MATH_MATRIX_CHECK
|
||||
/* Check for matrix mismatch condition */
|
||||
if ((pSrcA->numRows != pSrcB->numRows) ||
|
||||
(pSrcA->numCols != pSrcB->numCols) ||
|
||||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
|
||||
{
|
||||
/* Set status as ARM_MATH_SIZE_MISMATCH */
|
||||
status = ARM_MATH_SIZE_MISMATCH;
|
||||
}
|
||||
else
|
||||
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
|
||||
{
|
||||
/*
|
||||
* Total number of samples in the input matrix
|
||||
*/
|
||||
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
|
||||
blkCnt = numSamples >> 3;
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/* C(m,n) = A(m,n) + B(m,n) */
|
||||
/* sub and then store the results in the destination buffer. */
|
||||
vecA = vld1q(pSrcAVec);
|
||||
pSrcAVec += 8;
|
||||
vecB = vld1q(pSrcBVec);
|
||||
pSrcBVec += 8;
|
||||
vecDst = vsubq(vecA, vecB);
|
||||
vst1q(pDataDst, vecDst);
|
||||
pDataDst += 8;
|
||||
/*
|
||||
* Decrement the blockSize loop counter
|
||||
*/
|
||||
blkCnt--;
|
||||
}
|
||||
/*
|
||||
* tail
|
||||
* (will be merged thru tail predication)
|
||||
*/
|
||||
blkCnt = numSamples & 7;
|
||||
if (blkCnt > 0U)
|
||||
{
|
||||
mve_pred16_t p0 = vctp16q(blkCnt);
|
||||
vecA = vld1q(pSrcAVec);
|
||||
vecB = vld1q(pSrcBVec);
|
||||
vecDst = vsubq_m(vecDst, vecA, vecB, p0);
|
||||
vstrhq_p(pDataDst, vecDst, p0);
|
||||
}
|
||||
status = ARM_MATH_SUCCESS;
|
||||
}
|
||||
|
||||
/* Return to application */
|
||||
return (status);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
arm_status arm_mat_sub_f16(
|
||||
const arm_matrix_instance_f16 * pSrcA,
|
||||
const arm_matrix_instance_f16 * pSrcB,
|
||||
arm_matrix_instance_f16 * pDst)
|
||||
{
|
||||
float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
|
||||
float16_t *pInB = pSrcB->pData; /* input data matrix pointer B */
|
||||
float16_t *pOut = pDst->pData; /* output data matrix pointer */
|
||||
|
||||
uint32_t numSamples; /* total number of elements in the matrix */
|
||||
uint32_t blkCnt; /* loop counters */
|
||||
arm_status status; /* status of matrix subtraction */
|
||||
|
||||
#ifdef ARM_MATH_MATRIX_CHECK
|
||||
|
||||
/* Check for matrix mismatch condition */
|
||||
if ((pSrcA->numRows != pSrcB->numRows) ||
|
||||
(pSrcA->numCols != pSrcB->numCols) ||
|
||||
(pSrcA->numRows != pDst->numRows) ||
|
||||
(pSrcA->numCols != pDst->numCols) )
|
||||
{
|
||||
/* Set status as ARM_MATH_SIZE_MISMATCH */
|
||||
status = ARM_MATH_SIZE_MISMATCH;
|
||||
}
|
||||
else
|
||||
|
||||
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
|
||||
|
||||
{
|
||||
/* Total number of samples in input matrix */
|
||||
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
|
||||
|
||||
#if defined (ARM_MATH_LOOPUNROLL)
|
||||
|
||||
/* Loop unrolling: Compute 4 outputs at a time */
|
||||
blkCnt = numSamples >> 2U;
|
||||
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/* C(m,n) = A(m,n) - B(m,n) */
|
||||
|
||||
/* Subtract and store result in destination buffer. */
|
||||
*pOut++ = (*pInA++) - (*pInB++);
|
||||
*pOut++ = (*pInA++) - (*pInB++);
|
||||
*pOut++ = (*pInA++) - (*pInB++);
|
||||
*pOut++ = (*pInA++) - (*pInB++);
|
||||
|
||||
/* Decrement loop counter */
|
||||
blkCnt--;
|
||||
}
|
||||
|
||||
/* Loop unrolling: Compute remaining outputs */
|
||||
blkCnt = numSamples % 0x4U;
|
||||
|
||||
#else
|
||||
|
||||
/* Initialize blkCnt with number of samples */
|
||||
blkCnt = numSamples;
|
||||
|
||||
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
|
||||
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
/* C(m,n) = A(m,n) - B(m,n) */
|
||||
|
||||
/* Subtract and store result in destination buffer. */
|
||||
*pOut++ = (*pInA++) - (*pInB++);
|
||||
|
||||
/* Decrement loop counter */
|
||||
blkCnt--;
|
||||
}
|
||||
|
||||
/* Set status as ARM_MATH_SUCCESS */
|
||||
status = ARM_MATH_SUCCESS;
|
||||
}
|
||||
|
||||
/* Return to application */
|
||||
return (status);
|
||||
}
|
||||
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
|
||||
|
||||
/**
|
||||
@} end of MatrixSub group
|
||||
*/
|
||||
|
||||
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
|
||||
|
||||
@ -0,0 +1,202 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
* Project: CMSIS DSP Library
|
||||
* Title: arm_mat_trans_f16.c
|
||||
* Description: Floating-point matrix transpose
|
||||
*
|
||||
* $Date: 18. March 2020
|
||||
* $Revision: V1.6.0
|
||||
*
|
||||
* Target Processor: Cortex-M cores
|
||||
* -------------------------------------------------------------------- */
|
||||
/*
|
||||
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the License); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "dsp/matrix_functions_f16.h"
|
||||
|
||||
#if defined(ARM_FLOAT16_SUPPORTED)
|
||||
|
||||
|
||||
/**
|
||||
@ingroup groupMatrix
|
||||
*/
|
||||
|
||||
/**
|
||||
@addtogroup MatrixTrans
|
||||
@{
|
||||
*/
|
||||
|
||||
/**
|
||||
@brief Floating-point matrix transpose.
|
||||
@param[in] pSrc points to input matrix
|
||||
@param[out] pDst points to output matrix
|
||||
@return execution status
|
||||
- \ref ARM_MATH_SUCCESS : Operation successful
|
||||
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
|
||||
*/
|
||||
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
|
||||
|
||||
#include "arm_helium_utils.h"
|
||||
|
||||
arm_status arm_mat_trans_f16(
|
||||
const arm_matrix_instance_f16 * pSrc,
|
||||
arm_matrix_instance_f16 * pDst)
|
||||
{
|
||||
arm_status status; /* status of matrix transpose */
|
||||
|
||||
#ifdef ARM_MATH_MATRIX_CHECK
|
||||
|
||||
/* Check for matrix mismatch condition */
|
||||
if ((pSrc->numRows != pDst->numCols) ||
|
||||
(pSrc->numCols != pDst->numRows) )
|
||||
{
|
||||
/* Set status as ARM_MATH_SIZE_MISMATCH */
|
||||
status = ARM_MATH_SIZE_MISMATCH;
|
||||
}
|
||||
else
|
||||
|
||||
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
|
||||
|
||||
{
|
||||
if (pDst->numRows == pDst->numCols)
|
||||
{
|
||||
if (pDst->numCols == 1)
|
||||
{
|
||||
pDst->pData[0] = pSrc->pData[0];
|
||||
return(ARM_MATH_SUCCESS);
|
||||
}
|
||||
if (pDst->numCols == 2)
|
||||
return arm_mat_trans_16bit_2x2((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
|
||||
if (pDst->numCols == 3)
|
||||
return arm_mat_trans_16bit_3x3_mve((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
|
||||
if (pDst->numCols == 4)
|
||||
return arm_mat_trans_16bit_4x4_mve((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
|
||||
}
|
||||
|
||||
arm_mat_trans_16bit_generic(pSrc->numRows, pSrc->numCols, (uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
|
||||
/* Set status as ARM_MATH_SUCCESS */
|
||||
status = ARM_MATH_SUCCESS;
|
||||
}
|
||||
|
||||
/* Return to application */
|
||||
return (status);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
arm_status arm_mat_trans_f16(
|
||||
const arm_matrix_instance_f16 * pSrc,
|
||||
arm_matrix_instance_f16 * pDst)
|
||||
{
|
||||
float16_t *pIn = pSrc->pData; /* input data matrix pointer */
|
||||
float16_t *pOut = pDst->pData; /* output data matrix pointer */
|
||||
float16_t *px; /* Temporary output data matrix pointer */
|
||||
uint16_t nRows = pSrc->numRows; /* number of rows */
|
||||
uint16_t nCols = pSrc->numCols; /* number of columns */
|
||||
uint32_t col, row = nRows, i = 0U; /* Loop counters */
|
||||
arm_status status; /* status of matrix transpose */
|
||||
|
||||
#ifdef ARM_MATH_MATRIX_CHECK
|
||||
|
||||
/* Check for matrix mismatch condition */
|
||||
if ((pSrc->numRows != pDst->numCols) ||
|
||||
(pSrc->numCols != pDst->numRows) )
|
||||
{
|
||||
/* Set status as ARM_MATH_SIZE_MISMATCH */
|
||||
status = ARM_MATH_SIZE_MISMATCH;
|
||||
}
|
||||
else
|
||||
|
||||
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
|
||||
|
||||
{
|
||||
/* Matrix transpose by exchanging the rows with columns */
|
||||
/* row loop */
|
||||
do
|
||||
{
|
||||
/* Pointer px is set to starting address of column being processed */
|
||||
px = pOut + i;
|
||||
|
||||
#if defined (ARM_MATH_LOOPUNROLL)
|
||||
|
||||
/* Loop unrolling: Compute 4 outputs at a time */
|
||||
col = nCols >> 2U;
|
||||
|
||||
while (col > 0U) /* column loop */
|
||||
{
|
||||
/* Read and store input element in destination */
|
||||
*px = *pIn++;
|
||||
/* Update pointer px to point to next row of transposed matrix */
|
||||
px += nRows;
|
||||
|
||||
*px = *pIn++;
|
||||
px += nRows;
|
||||
|
||||
*px = *pIn++;
|
||||
px += nRows;
|
||||
|
||||
*px = *pIn++;
|
||||
px += nRows;
|
||||
|
||||
/* Decrement column loop counter */
|
||||
col--;
|
||||
}
|
||||
|
||||
/* Loop unrolling: Compute remaining outputs */
|
||||
col = nCols % 0x4U;
|
||||
|
||||
#else
|
||||
|
||||
/* Initialize col with number of samples */
|
||||
col = nCols;
|
||||
|
||||
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
|
||||
|
||||
while (col > 0U)
|
||||
{
|
||||
/* Read and store input element in destination */
|
||||
*px = *pIn++;
|
||||
|
||||
/* Update pointer px to point to next row of transposed matrix */
|
||||
px += nRows;
|
||||
|
||||
/* Decrement column loop counter */
|
||||
col--;
|
||||
}
|
||||
|
||||
i++;
|
||||
|
||||
/* Decrement row loop counter */
|
||||
row--;
|
||||
|
||||
} while (row > 0U); /* row loop end */
|
||||
|
||||
/* Set status as ARM_MATH_SUCCESS */
|
||||
status = ARM_MATH_SUCCESS;
|
||||
}
|
||||
|
||||
/* Return to application */
|
||||
return (status);
|
||||
}
|
||||
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
|
||||
|
||||
/**
|
||||
* @} end of MatrixTrans group
|
||||
*/
|
||||
|
||||
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
|
||||
|
||||
@ -0,0 +1,394 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
* Project: CMSIS DSP Library
|
||||
* Title: arm_mat_vec_mult_f16.c
|
||||
* Description: Floating-point matrix and vector multiplication
|
||||
*
|
||||
* $Date: 07. July 202
|
||||
*
|
||||
* Target Processor: Cortex-M cores
|
||||
* -------------------------------------------------------------------- */
|
||||
/*
|
||||
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the License); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "dsp/matrix_functions_f16.h"
|
||||
|
||||
#if defined(ARM_FLOAT16_SUPPORTED)
|
||||
|
||||
|
||||
/**
|
||||
* @ingroup groupMatrix
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @addtogroup MatrixVectMult
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Floating-point matrix and vector multiplication.
|
||||
* @param[in] *pSrcMat points to the input matrix structure
|
||||
* @param[in] *pVec points to input vector
|
||||
* @param[out] *pDst points to output vector
|
||||
*/
|
||||
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
|
||||
|
||||
#include "arm_helium_utils.h"
|
||||
|
||||
void arm_mat_vec_mult_f16(
|
||||
const arm_matrix_instance_f16 *pSrcMat,
|
||||
const float16_t *pSrcVec,
|
||||
float16_t *pDstVec)
|
||||
{
|
||||
uint32_t numRows = pSrcMat->numRows;
|
||||
uint32_t numCols = pSrcMat->numCols;
|
||||
const float16_t *pSrcA = pSrcMat->pData;
|
||||
const float16_t *pInA0;
|
||||
const float16_t *pInA1;
|
||||
float16_t *px;
|
||||
int32_t row;
|
||||
uint32_t blkCnt; /* loop counters */
|
||||
|
||||
row = numRows;
|
||||
px = pDstVec;
|
||||
|
||||
/*
|
||||
* compute 4 rows in parallel
|
||||
*/
|
||||
while (row >= 4)
|
||||
{
|
||||
const float16_t *pInA2, *pInA3;
|
||||
float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
|
||||
f16x8_t vecIn, acc0, acc1, acc2, acc3;
|
||||
float16_t const *pSrcVecPtr = pSrcVec;
|
||||
|
||||
/*
|
||||
* Initialize the pointers to 4 consecutive MatrixA rows
|
||||
*/
|
||||
pInA0 = pSrcA;
|
||||
pInA1 = pInA0 + numCols;
|
||||
pInA2 = pInA1 + numCols;
|
||||
pInA3 = pInA2 + numCols;
|
||||
/*
|
||||
* Initialize the vector pointer
|
||||
*/
|
||||
pInVec = pSrcVecPtr;
|
||||
/*
|
||||
* reset accumulators
|
||||
*/
|
||||
acc0 = vdupq_n_f16(0.0f);
|
||||
acc1 = vdupq_n_f16(0.0f);
|
||||
acc2 = vdupq_n_f16(0.0f);
|
||||
acc3 = vdupq_n_f16(0.0f);
|
||||
|
||||
pSrcA0Vec = pInA0;
|
||||
pSrcA1Vec = pInA1;
|
||||
pSrcA2Vec = pInA2;
|
||||
pSrcA3Vec = pInA3;
|
||||
|
||||
blkCnt = numCols >> 3;
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
f16x8_t vecA;
|
||||
|
||||
vecIn = vld1q(pInVec);
|
||||
pInVec += 8;
|
||||
vecA = vld1q(pSrcA0Vec);
|
||||
pSrcA0Vec += 8;
|
||||
acc0 = vfmaq(acc0, vecIn, vecA);
|
||||
vecA = vld1q(pSrcA1Vec);
|
||||
pSrcA1Vec += 8;
|
||||
acc1 = vfmaq(acc1, vecIn, vecA);
|
||||
vecA = vld1q(pSrcA2Vec);
|
||||
pSrcA2Vec += 8;
|
||||
acc2 = vfmaq(acc2, vecIn, vecA);
|
||||
vecA = vld1q(pSrcA3Vec);
|
||||
pSrcA3Vec += 8;
|
||||
acc3 = vfmaq(acc3, vecIn, vecA);
|
||||
|
||||
blkCnt--;
|
||||
}
|
||||
/*
|
||||
* tail
|
||||
* (will be merged thru tail predication)
|
||||
*/
|
||||
blkCnt = numCols & 7;
|
||||
if (blkCnt > 0U)
|
||||
{
|
||||
mve_pred16_t p0 = vctp16q(blkCnt);
|
||||
f16x8_t vecA;
|
||||
|
||||
vecIn = vldrhq_z_f16(pInVec, p0);
|
||||
vecA = vld1q(pSrcA0Vec);
|
||||
acc0 = vfmaq(acc0, vecIn, vecA);
|
||||
vecA = vld1q(pSrcA1Vec);
|
||||
acc1 = vfmaq(acc1, vecIn, vecA);
|
||||
vecA = vld1q(pSrcA2Vec);
|
||||
acc2 = vfmaq(acc2, vecIn, vecA);
|
||||
vecA = vld1q(pSrcA3Vec);
|
||||
acc3 = vfmaq(acc3, vecIn, vecA);
|
||||
}
|
||||
/*
|
||||
* Sum the partial parts
|
||||
*/
|
||||
*px++ = vecAddAcrossF16Mve(acc0);
|
||||
*px++ = vecAddAcrossF16Mve(acc1);
|
||||
*px++ = vecAddAcrossF16Mve(acc2);
|
||||
*px++ = vecAddAcrossF16Mve(acc3);
|
||||
|
||||
pSrcA += numCols * 4;
|
||||
/*
|
||||
* Decrement the row loop counter
|
||||
*/
|
||||
row -= 4;
|
||||
}
|
||||
|
||||
/*
|
||||
* compute 2 rows in parrallel
|
||||
*/
|
||||
if (row >= 2)
|
||||
{
|
||||
float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
|
||||
f16x8_t vecIn, acc0, acc1;
|
||||
float16_t const *pSrcVecPtr = pSrcVec;
|
||||
|
||||
/*
|
||||
* Initialize the pointers to 2 consecutive MatrixA rows
|
||||
*/
|
||||
pInA0 = pSrcA;
|
||||
pInA1 = pInA0 + numCols;
|
||||
/*
|
||||
* Initialize the vector pointer
|
||||
*/
|
||||
pInVec = pSrcVecPtr;
|
||||
/*
|
||||
* reset accumulators
|
||||
*/
|
||||
acc0 = vdupq_n_f16(0.0f);
|
||||
acc1 = vdupq_n_f16(0.0f);
|
||||
pSrcA0Vec = pInA0;
|
||||
pSrcA1Vec = pInA1;
|
||||
|
||||
blkCnt = numCols >> 3;
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
f16x8_t vecA;
|
||||
|
||||
vecIn = vld1q(pInVec);
|
||||
pInVec += 8;
|
||||
vecA = vld1q(pSrcA0Vec);
|
||||
pSrcA0Vec += 8;
|
||||
acc0 = vfmaq(acc0, vecIn, vecA);
|
||||
vecA = vld1q(pSrcA1Vec);
|
||||
pSrcA1Vec += 8;
|
||||
acc1 = vfmaq(acc1, vecIn, vecA);
|
||||
|
||||
blkCnt--;
|
||||
}
|
||||
/*
|
||||
* tail
|
||||
* (will be merged thru tail predication)
|
||||
*/
|
||||
blkCnt = numCols & 7;
|
||||
if (blkCnt > 0U)
|
||||
{
|
||||
mve_pred16_t p0 = vctp16q(blkCnt);
|
||||
f16x8_t vecA;
|
||||
|
||||
vecIn = vldrhq_z_f16(pInVec, p0);
|
||||
vecA = vld1q(pSrcA0Vec);
|
||||
acc0 = vfmaq(acc0, vecIn, vecA);
|
||||
vecA = vld1q(pSrcA1Vec);
|
||||
acc1 = vfmaq(acc1, vecIn, vecA);
|
||||
}
|
||||
/*
|
||||
* Sum the partial parts
|
||||
*/
|
||||
*px++ = vecAddAcrossF16Mve(acc0);
|
||||
*px++ = vecAddAcrossF16Mve(acc1);
|
||||
|
||||
pSrcA += numCols * 2;
|
||||
row -= 2;
|
||||
}
|
||||
|
||||
if (row >= 1)
|
||||
{
|
||||
f16x8_t vecIn, acc0;
|
||||
float16_t const *pSrcA0Vec, *pInVec;
|
||||
float16_t const *pSrcVecPtr = pSrcVec;
|
||||
/*
|
||||
* Initialize the pointers to last MatrixA row
|
||||
*/
|
||||
pInA0 = pSrcA;
|
||||
/*
|
||||
* Initialize the vector pointer
|
||||
*/
|
||||
pInVec = pSrcVecPtr;
|
||||
/*
|
||||
* reset accumulators
|
||||
*/
|
||||
acc0 = vdupq_n_f16(0.0f);
|
||||
|
||||
pSrcA0Vec = pInA0;
|
||||
|
||||
blkCnt = numCols >> 3;
|
||||
while (blkCnt > 0U)
|
||||
{
|
||||
f16x8_t vecA;
|
||||
|
||||
vecIn = vld1q(pInVec);
|
||||
pInVec += 8;
|
||||
vecA = vld1q(pSrcA0Vec);
|
||||
pSrcA0Vec += 8;
|
||||
acc0 = vfmaq(acc0, vecIn, vecA);
|
||||
|
||||
blkCnt--;
|
||||
}
|
||||
/*
|
||||
* tail
|
||||
* (will be merged thru tail predication)
|
||||
*/
|
||||
blkCnt = numCols & 7;
|
||||
if (blkCnt > 0U)
|
||||
{
|
||||
mve_pred16_t p0 = vctp16q(blkCnt);
|
||||
f16x8_t vecA;
|
||||
|
||||
vecIn = vldrhq_z_f16(pInVec, p0);
|
||||
vecA = vld1q(pSrcA0Vec);
|
||||
acc0 = vfmaq(acc0, vecIn, vecA);
|
||||
}
|
||||
/*
|
||||
* Sum the partial parts
|
||||
*/
|
||||
*px++ = vecAddAcrossF16Mve(acc0);
|
||||
}
|
||||
}
|
||||
#else
|
||||
void arm_mat_vec_mult_f16(const arm_matrix_instance_f16 *pSrcMat, const float16_t *pVec, float16_t *pDst)
|
||||
{
|
||||
uint32_t numRows = pSrcMat->numRows;
|
||||
uint32_t numCols = pSrcMat->numCols;
|
||||
const float16_t *pSrcA = pSrcMat->pData;
|
||||
const float16_t *pInA1; /* input data matrix pointer A of Q31 type */
|
||||
const float16_t *pInA2; /* input data matrix pointer A of Q31 type */
|
||||
const float16_t *pInA3; /* input data matrix pointer A of Q31 type */
|
||||
const float16_t *pInA4; /* input data matrix pointer A of Q31 type */
|
||||
const float16_t *pInVec; /* input data matrix pointer B of Q31 type */
|
||||
float16_t *px; /* Temporary output data matrix pointer */
|
||||
uint16_t i, row, colCnt; /* loop counters */
|
||||
float16_t matData, matData2, vecData, vecData2;
|
||||
|
||||
|
||||
/* Process 4 rows at a time */
|
||||
row = numRows >> 2;
|
||||
i = 0u;
|
||||
px = pDst;
|
||||
|
||||
/* The following loop performs the dot-product of each row in pSrcA with the vector */
|
||||
/* row loop */
|
||||
while (row > 0) {
|
||||
/* For every row wise process, the pInVec pointer is set
|
||||
** to the starting address of the vector */
|
||||
pInVec = pVec;
|
||||
|
||||
/* Initialize accumulators */
|
||||
float16_t sum1 = 0.0f;
|
||||
float16_t sum2 = 0.0f;
|
||||
float16_t sum3 = 0.0f;
|
||||
float16_t sum4 = 0.0f;
|
||||
|
||||
/* Loop unrolling: process 2 columns per iteration */
|
||||
colCnt = numCols;
|
||||
|
||||
/* Initialize pointers to the starting address of the column being processed */
|
||||
pInA1 = pSrcA + i;
|
||||
pInA2 = pInA1 + numCols;
|
||||
pInA3 = pInA2 + numCols;
|
||||
pInA4 = pInA3 + numCols;
|
||||
|
||||
|
||||
// Main loop: matrix-vector multiplication
|
||||
while (colCnt > 0u) {
|
||||
// Read 2 values from vector
|
||||
vecData = *(pInVec)++;
|
||||
// Read 8 values from the matrix - 2 values from each of 4 rows, and do multiply accumulate
|
||||
matData = *(pInA1)++;
|
||||
sum1 += matData * vecData;
|
||||
matData = *(pInA2)++;
|
||||
sum2 += matData * vecData;
|
||||
matData = *(pInA3)++;
|
||||
sum3 += matData * vecData;
|
||||
matData = *(pInA4)++;
|
||||
sum4 += matData * vecData;
|
||||
|
||||
// Decrement the loop counter
|
||||
colCnt--;
|
||||
}
|
||||
|
||||
/* Saturate and store the result in the destination buffer */
|
||||
*px++ = sum1;
|
||||
*px++ = sum2;
|
||||
*px++ = sum3;
|
||||
*px++ = sum4;
|
||||
|
||||
i = i + numCols * 4;
|
||||
|
||||
/* Decrement the row loop counter */
|
||||
row--;
|
||||
}
|
||||
|
||||
/* process any remaining rows */
|
||||
row = numRows & 3u;
|
||||
while (row > 0) {
|
||||
|
||||
float16_t sum = 0.0f;
|
||||
pInVec = pVec;
|
||||
pInA1 = pSrcA + i;
|
||||
|
||||
colCnt = numCols >> 1;
|
||||
|
||||
while (colCnt > 0) {
|
||||
vecData = *(pInVec)++;
|
||||
vecData2 = *(pInVec)++;
|
||||
matData = *(pInA1)++;
|
||||
matData2 = *(pInA1)++;
|
||||
sum += matData * vecData;
|
||||
sum += matData2 * vecData2;
|
||||
colCnt--;
|
||||
}
|
||||
// process remainder of row
|
||||
colCnt = numCols & 1u;
|
||||
while (colCnt > 0) {
|
||||
sum += *pInA1++ * *pInVec++;
|
||||
colCnt--;
|
||||
}
|
||||
|
||||
*px++ = sum;
|
||||
i = i + numCols;
|
||||
row--;
|
||||
}
|
||||
}
|
||||
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
|
||||
|
||||
/**
|
||||
* @} end of MatrixMult group
|
||||
*/
|
||||
|
||||
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
|
||||
|
||||
@ -0,0 +1,32 @@
|
||||
#include "Test.h"
|
||||
#include "Pattern.h"
|
||||
|
||||
#include "dsp/matrix_functions_f16.h"
|
||||
|
||||
class BinaryTestsF16:public Client::Suite
|
||||
{
|
||||
public:
|
||||
BinaryTestsF16(Testing::testID_t id);
|
||||
virtual void setUp(Testing::testID_t,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr);
|
||||
virtual void tearDown(Testing::testID_t,Client::PatternMgr *mgr);
|
||||
private:
|
||||
#include "BinaryTestsF16_decl.h"
|
||||
Client::Pattern<float16_t> input1;
|
||||
Client::Pattern<float16_t> input2;
|
||||
Client::Pattern<float16_t> ref;
|
||||
Client::Pattern<int16_t> dims;
|
||||
Client::LocalPattern<float16_t> output;
|
||||
|
||||
/* Local copies of inputs since matrix instance in CMSIS-DSP are not using
|
||||
pointers to const.
|
||||
*/
|
||||
Client::LocalPattern<float16_t> a;
|
||||
Client::LocalPattern<float16_t> b;
|
||||
|
||||
int nbr;
|
||||
int nbc;
|
||||
|
||||
arm_matrix_instance_f16 in1;
|
||||
arm_matrix_instance_f16 in2;
|
||||
arm_matrix_instance_f16 out;
|
||||
};
|
||||
@ -0,0 +1,32 @@
|
||||
#include "Test.h"
|
||||
#include "Pattern.h"
|
||||
|
||||
#include "dsp/matrix_functions_f16.h"
|
||||
|
||||
class UnaryTestsF16:public Client::Suite
|
||||
{
|
||||
public:
|
||||
UnaryTestsF16(Testing::testID_t id);
|
||||
virtual void setUp(Testing::testID_t,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr);
|
||||
virtual void tearDown(Testing::testID_t,Client::PatternMgr *mgr);
|
||||
private:
|
||||
#include "UnaryTestsF16_decl.h"
|
||||
Client::Pattern<float16_t> input1;
|
||||
Client::Pattern<float16_t> input2;
|
||||
Client::Pattern<float16_t> ref;
|
||||
Client::Pattern<int16_t> dims;
|
||||
Client::LocalPattern<float16_t> output;
|
||||
|
||||
/* Local copies of inputs since matrix instance in CMSIS-DSP are not using
|
||||
pointers to const.
|
||||
*/
|
||||
Client::LocalPattern<float16_t> a;
|
||||
Client::LocalPattern<float16_t> b;
|
||||
|
||||
int nbr;
|
||||
int nbc;
|
||||
|
||||
arm_matrix_instance_f16 in1;
|
||||
arm_matrix_instance_f16 in2;
|
||||
arm_matrix_instance_f16 out;
|
||||
};
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,22 @@
|
||||
H
|
||||
10
|
||||
// 1
|
||||
0x0001
|
||||
// 2
|
||||
0x0002
|
||||
// 3
|
||||
0x0003
|
||||
// 4
|
||||
0x0004
|
||||
// 7
|
||||
0x0007
|
||||
// 8
|
||||
0x0008
|
||||
// 9
|
||||
0x0009
|
||||
// 15
|
||||
0x000F
|
||||
// 16
|
||||
0x0010
|
||||
// 2
|
||||
0x0002
|
||||
@ -0,0 +1,198 @@
|
||||
H
|
||||
98
|
||||
// 1
|
||||
0x0001
|
||||
// 1
|
||||
0x0001
|
||||
// 1
|
||||
0x0001
|
||||
// 2
|
||||
0x0002
|
||||
// 1
|
||||
0x0001
|
||||
// 3
|
||||
0x0003
|
||||
// 1
|
||||
0x0001
|
||||
// 4
|
||||
0x0004
|
||||
// 1
|
||||
0x0001
|
||||
// 7
|
||||
0x0007
|
||||
// 1
|
||||
0x0001
|
||||
// 16
|
||||
0x0010
|
||||
// 1
|
||||
0x0001
|
||||
// 23
|
||||
0x0017
|
||||
// 2
|
||||
0x0002
|
||||
// 1
|
||||
0x0001
|
||||
// 2
|
||||
0x0002
|
||||
// 2
|
||||
0x0002
|
||||
// 2
|
||||
0x0002
|
||||
// 3
|
||||
0x0003
|
||||
// 2
|
||||
0x0002
|
||||
// 4
|
||||
0x0004
|
||||
// 2
|
||||
0x0002
|
||||
// 7
|
||||
0x0007
|
||||
// 2
|
||||
0x0002
|
||||
// 16
|
||||
0x0010
|
||||
// 2
|
||||
0x0002
|
||||
// 23
|
||||
0x0017
|
||||
// 3
|
||||
0x0003
|
||||
// 1
|
||||
0x0001
|
||||
// 3
|
||||
0x0003
|
||||
// 2
|
||||
0x0002
|
||||
// 3
|
||||
0x0003
|
||||
// 3
|
||||
0x0003
|
||||
// 3
|
||||
0x0003
|
||||
// 4
|
||||
0x0004
|
||||
// 3
|
||||
0x0003
|
||||
// 7
|
||||
0x0007
|
||||
// 3
|
||||
0x0003
|
||||
// 16
|
||||
0x0010
|
||||
// 3
|
||||
0x0003
|
||||
// 23
|
||||
0x0017
|
||||
// 4
|
||||
0x0004
|
||||
// 1
|
||||
0x0001
|
||||
// 4
|
||||
0x0004
|
||||
// 2
|
||||
0x0002
|
||||
// 4
|
||||
0x0004
|
||||
// 3
|
||||
0x0003
|
||||
// 4
|
||||
0x0004
|
||||
// 4
|
||||
0x0004
|
||||
// 4
|
||||
0x0004
|
||||
// 7
|
||||
0x0007
|
||||
// 4
|
||||
0x0004
|
||||
// 16
|
||||
0x0010
|
||||
// 4
|
||||
0x0004
|
||||
// 23
|
||||
0x0017
|
||||
// 7
|
||||
0x0007
|
||||
// 1
|
||||
0x0001
|
||||
// 7
|
||||
0x0007
|
||||
// 2
|
||||
0x0002
|
||||
// 7
|
||||
0x0007
|
||||
// 3
|
||||
0x0003
|
||||
// 7
|
||||
0x0007
|
||||
// 4
|
||||
0x0004
|
||||
// 7
|
||||
0x0007
|
||||
// 7
|
||||
0x0007
|
||||
// 7
|
||||
0x0007
|
||||
// 16
|
||||
0x0010
|
||||
// 7
|
||||
0x0007
|
||||
// 23
|
||||
0x0017
|
||||
// 16
|
||||
0x0010
|
||||
// 1
|
||||
0x0001
|
||||
// 16
|
||||
0x0010
|
||||
// 2
|
||||
0x0002
|
||||
// 16
|
||||
0x0010
|
||||
// 3
|
||||
0x0003
|
||||
// 16
|
||||
0x0010
|
||||
// 4
|
||||
0x0004
|
||||
// 16
|
||||
0x0010
|
||||
// 7
|
||||
0x0007
|
||||
// 16
|
||||
0x0010
|
||||
// 16
|
||||
0x0010
|
||||
// 16
|
||||
0x0010
|
||||
// 23
|
||||
0x0017
|
||||
// 23
|
||||
0x0017
|
||||
// 1
|
||||
0x0001
|
||||
// 23
|
||||
0x0017
|
||||
// 2
|
||||
0x0002
|
||||
// 23
|
||||
0x0017
|
||||
// 3
|
||||
0x0003
|
||||
// 23
|
||||
0x0017
|
||||
// 4
|
||||
0x0004
|
||||
// 23
|
||||
0x0017
|
||||
// 7
|
||||
0x0007
|
||||
// 23
|
||||
0x0017
|
||||
// 16
|
||||
0x0010
|
||||
// 23
|
||||
0x0017
|
||||
// 23
|
||||
0x0017
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,96 @@
|
||||
H
|
||||
47
|
||||
// 1.000000
|
||||
0x3c00
|
||||
// -0.013554
|
||||
0xa2f1
|
||||
// -0.516418
|
||||
0xb822
|
||||
// 0.099856
|
||||
0x2e64
|
||||
// -0.063094
|
||||
0xac0a
|
||||
// 0.235283
|
||||
0x3387
|
||||
// 0.058222
|
||||
0x2b74
|
||||
// -0.778461
|
||||
0xba3a
|
||||
// -0.142255
|
||||
0xb08d
|
||||
// -0.098023
|
||||
0xae46
|
||||
// 0.251210
|
||||
0x3405
|
||||
// -0.101114
|
||||
0xae79
|
||||
// 0.148685
|
||||
0x30c2
|
||||
// -0.064970
|
||||
0xac28
|
||||
// 0.080360
|
||||
0x2d25
|
||||
// 0.510886
|
||||
0x3816
|
||||
// -0.317613
|
||||
0xb515
|
||||
// 0.317399
|
||||
0x3514
|
||||
// -0.015243
|
||||
0xa3ce
|
||||
// -0.029123
|
||||
0xa775
|
||||
// 0.284566
|
||||
0x348e
|
||||
// -0.134889
|
||||
0xb051
|
||||
// -0.064918
|
||||
0xac28
|
||||
// 0.183938
|
||||
0x31e3
|
||||
// -0.253136
|
||||
0xb40d
|
||||
// 0.267614
|
||||
0x3448
|
||||
// 0.170277
|
||||
0x3173
|
||||
// 0.530148
|
||||
0x383e
|
||||
// -0.340637
|
||||
0xb573
|
||||
// 0.875454
|
||||
0x3b01
|
||||
// -0.357078
|
||||
0xb5b7
|
||||
// 0.489558
|
||||
0x37d5
|
||||
// 0.039769
|
||||
0x2917
|
||||
// -0.692500
|
||||
0xb98a
|
||||
// 0.142047
|
||||
0x308c
|
||||
// 0.235809
|
||||
0x338c
|
||||
// 0.207209
|
||||
0x32a1
|
||||
// 0.331737
|
||||
0x354f
|
||||
// -0.032288
|
||||
0xa822
|
||||
// -0.126846
|
||||
0xb00f
|
||||
// -0.718843
|
||||
0xb9c0
|
||||
// 0.294611
|
||||
0x34b7
|
||||
// 0.361941
|
||||
0x35cb
|
||||
// 0.066997
|
||||
0x2c4a
|
||||
// -0.021069
|
||||
0xa565
|
||||
// -0.464241
|
||||
0xb76e
|
||||
// 0.007340
|
||||
0x1f84
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,786 @@
|
||||
H
|
||||
392
|
||||
// 0.065177
|
||||
0x2c2c
|
||||
// 0.073375
|
||||
0x2cb2
|
||||
// 0.075378
|
||||
0x2cd3
|
||||
// 0.091117
|
||||
0x2dd5
|
||||
// -0.040496
|
||||
0xa92f
|
||||
// 0.032118
|
||||
0x281c
|
||||
// 0.110453
|
||||
0x2f12
|
||||
// 0.065177
|
||||
0x2c2c
|
||||
// -0.604829
|
||||
0xb8d7
|
||||
// 0.073375
|
||||
0x2cb2
|
||||
// -0.006015
|
||||
0x9e29
|
||||
// 0.075378
|
||||
0x2cd3
|
||||
// 0.391532
|
||||
0x3644
|
||||
// 0.091117
|
||||
0x2dd5
|
||||
// 0.129558
|
||||
0x3025
|
||||
// -0.040496
|
||||
0xa92f
|
||||
// -0.285559
|
||||
0xb492
|
||||
// 0.032118
|
||||
0x281c
|
||||
// -0.478016
|
||||
0xb7a6
|
||||
// 0.110453
|
||||
0x2f12
|
||||
// 0.171094
|
||||
0x317a
|
||||
// 0.065177
|
||||
0x2c2c
|
||||
// -0.604829
|
||||
0xb8d7
|
||||
// -0.003879
|
||||
0x9bf2
|
||||
// 0.073375
|
||||
0x2cb2
|
||||
// -0.006015
|
||||
0x9e29
|
||||
// -0.201411
|
||||
0xb272
|
||||
// 0.075378
|
||||
0x2cd3
|
||||
// 0.391532
|
||||
0x3644
|
||||
// -0.602685
|
||||
0xb8d2
|
||||
// 0.091117
|
||||
0x2dd5
|
||||
// 0.129558
|
||||
0x3025
|
||||
// 0.043250
|
||||
0x2989
|
||||
// -0.040496
|
||||
0xa92f
|
||||
// -0.285559
|
||||
0xb492
|
||||
// -0.214891
|
||||
0xb2e0
|
||||
// 0.032118
|
||||
0x281c
|
||||
// -0.478016
|
||||
0xb7a6
|
||||
// 0.914179
|
||||
0x3b50
|
||||
// 0.110453
|
||||
0x2f12
|
||||
// 0.171094
|
||||
0x317a
|
||||
// -0.096979
|
||||
0xae35
|
||||
// 0.065177
|
||||
0x2c2c
|
||||
// -0.604829
|
||||
0xb8d7
|
||||
// -0.003879
|
||||
0x9bf2
|
||||
// 0.157615
|
||||
0x310b
|
||||
// 0.073375
|
||||
0x2cb2
|
||||
// -0.006015
|
||||
0x9e29
|
||||
// -0.201411
|
||||
0xb272
|
||||
// -0.674383
|
||||
0xb965
|
||||
// 0.075378
|
||||
0x2cd3
|
||||
// 0.391532
|
||||
0x3644
|
||||
// -0.602685
|
||||
0xb8d2
|
||||
// 0.241720
|
||||
0x33bc
|
||||
// 0.091117
|
||||
0x2dd5
|
||||
// 0.129558
|
||||
0x3025
|
||||
// 0.043250
|
||||
0x2989
|
||||
// -0.003618
|
||||
0x9b69
|
||||
// -0.040496
|
||||
0xa92f
|
||||
// -0.285559
|
||||
0xb492
|
||||
// -0.214891
|
||||
0xb2e0
|
||||
// -0.037703
|
||||
0xa8d3
|
||||
// 0.032118
|
||||
0x281c
|
||||
// -0.478016
|
||||
0xb7a6
|
||||
// 0.914179
|
||||
0x3b50
|
||||
// -0.400779
|
||||
0xb66a
|
||||
// 0.110453
|
||||
0x2f12
|
||||
// 0.171094
|
||||
0x317a
|
||||
// -0.096979
|
||||
0xae35
|
||||
// 0.228860
|
||||
0x3353
|
||||
// 0.065177
|
||||
0x2c2c
|
||||
// -0.604829
|
||||
0xb8d7
|
||||
// -0.003879
|
||||
0x9bf2
|
||||
// 0.157615
|
||||
0x310b
|
||||
// -0.207477
|
||||
0xb2a4
|
||||
// -0.447515
|
||||
0xb729
|
||||
// -0.676908
|
||||
0xb96a
|
||||
// 0.073375
|
||||
0x2cb2
|
||||
// -0.006015
|
||||
0x9e29
|
||||
// -0.201411
|
||||
0xb272
|
||||
// -0.674383
|
||||
0xb965
|
||||
// -0.140275
|
||||
0xb07d
|
||||
// -0.400734
|
||||
0xb669
|
||||
// -0.148252
|
||||
0xb0be
|
||||
// 0.075378
|
||||
0x2cd3
|
||||
// 0.391532
|
||||
0x3644
|
||||
// -0.602685
|
||||
0xb8d2
|
||||
// 0.241720
|
||||
0x33bc
|
||||
// -0.013486
|
||||
0xa2e8
|
||||
// 0.071506
|
||||
0x2c94
|
||||
// -0.421687
|
||||
0xb6bf
|
||||
// 0.091117
|
||||
0x2dd5
|
||||
// 0.129558
|
||||
0x3025
|
||||
// 0.043250
|
||||
0x2989
|
||||
// -0.003618
|
||||
0x9b69
|
||||
// 0.032091
|
||||
0x281c
|
||||
// -0.026998
|
||||
0xa6e9
|
||||
// 0.198279
|
||||
0x3258
|
||||
// -0.040496
|
||||
0xa92f
|
||||
// -0.285559
|
||||
0xb492
|
||||
// -0.214891
|
||||
0xb2e0
|
||||
// -0.037703
|
||||
0xa8d3
|
||||
// -0.455534
|
||||
0xb74a
|
||||
// -0.394148
|
||||
0xb64e
|
||||
// 0.121496
|
||||
0x2fc7
|
||||
// 0.032118
|
||||
0x281c
|
||||
// -0.478016
|
||||
0xb7a6
|
||||
// 0.914179
|
||||
0x3b50
|
||||
// -0.400779
|
||||
0xb66a
|
||||
// 0.071360
|
||||
0x2c91
|
||||
// 0.421843
|
||||
0x36c0
|
||||
// 0.675201
|
||||
0x3967
|
||||
// 0.110453
|
||||
0x2f12
|
||||
// 0.171094
|
||||
0x317a
|
||||
// -0.096979
|
||||
0xae35
|
||||
// 0.228860
|
||||
0x3353
|
||||
// -0.030445
|
||||
0xa7cb
|
||||
// -0.052441
|
||||
0xaab6
|
||||
// -0.262839
|
||||
0xb435
|
||||
// 0.065177
|
||||
0x2c2c
|
||||
// -0.604829
|
||||
0xb8d7
|
||||
// -0.003879
|
||||
0x9bf2
|
||||
// 0.157615
|
||||
0x310b
|
||||
// -0.207477
|
||||
0xb2a4
|
||||
// -0.447515
|
||||
0xb729
|
||||
// -0.676908
|
||||
0xb96a
|
||||
// -0.186246
|
||||
0xb1f6
|
||||
// -0.138838
|
||||
0xb071
|
||||
// 0.105986
|
||||
0x2ec8
|
||||
// -0.404153
|
||||
0xb677
|
||||
// -0.252231
|
||||
0xb409
|
||||
// -0.145413
|
||||
0xb0a7
|
||||
// 0.209444
|
||||
0x32b4
|
||||
// -0.260962
|
||||
0xb42d
|
||||
// 0.098829
|
||||
0x2e53
|
||||
// 0.073375
|
||||
0x2cb2
|
||||
// -0.006015
|
||||
0x9e29
|
||||
// -0.201411
|
||||
0xb272
|
||||
// -0.674383
|
||||
0xb965
|
||||
// -0.140275
|
||||
0xb07d
|
||||
// -0.400734
|
||||
0xb669
|
||||
// -0.148252
|
||||
0xb0be
|
||||
// -0.262301
|
||||
0xb432
|
||||
// -0.163921
|
||||
0xb13f
|
||||
// -0.432204
|
||||
0xb6ea
|
||||
// -0.019905
|
||||
0xa518
|
||||
// 0.081151
|
||||
0x2d32
|
||||
// 0.453349
|
||||
0x3741
|
||||
// 0.588064
|
||||
0x38b4
|
||||
// -0.420738
|
||||
0xb6bb
|
||||
// 0.103660
|
||||
0x2ea2
|
||||
// 0.075378
|
||||
0x2cd3
|
||||
// 0.391532
|
||||
0x3644
|
||||
// -0.602685
|
||||
0xb8d2
|
||||
// 0.241720
|
||||
0x33bc
|
||||
// -0.013486
|
||||
0xa2e8
|
||||
// 0.071506
|
||||
0x2c94
|
||||
// -0.421687
|
||||
0xb6bf
|
||||
// -0.228767
|
||||
0xb352
|
||||
// 0.145997
|
||||
0x30ac
|
||||
// 0.676217
|
||||
0x3969
|
||||
// -0.097938
|
||||
0xae45
|
||||
// 0.477692
|
||||
0x37a5
|
||||
// -0.176282
|
||||
0xb1a4
|
||||
// -0.409477
|
||||
0xb68d
|
||||
// 0.034974
|
||||
0x287a
|
||||
// -0.103844
|
||||
0xaea5
|
||||
// 0.091117
|
||||
0x2dd5
|
||||
// 0.129558
|
||||
0x3025
|
||||
// 0.043250
|
||||
0x2989
|
||||
// -0.003618
|
||||
0x9b69
|
||||
// 0.032091
|
||||
0x281c
|
||||
// -0.026998
|
||||
0xa6e9
|
||||
// 0.198279
|
||||
0x3258
|
||||
// -0.505518
|
||||
0xb80b
|
||||
// 0.455219
|
||||
0x3749
|
||||
// -0.204145
|
||||
0xb288
|
||||
// 0.230497
|
||||
0x3360
|
||||
// -0.055522
|
||||
0xab1b
|
||||
// -0.576398
|
||||
0xb89c
|
||||
// 0.330282
|
||||
0x3549
|
||||
// -0.262504
|
||||
0xb433
|
||||
// 0.487315
|
||||
0x37cc
|
||||
// -0.040496
|
||||
0xa92f
|
||||
// -0.285559
|
||||
0xb492
|
||||
// -0.214891
|
||||
0xb2e0
|
||||
// -0.037703
|
||||
0xa8d3
|
||||
// -0.455534
|
||||
0xb74a
|
||||
// -0.394148
|
||||
0xb64e
|
||||
// 0.121496
|
||||
0x2fc7
|
||||
// 0.384751
|
||||
0x3628
|
||||
// -0.289029
|
||||
0xb4a0
|
||||
// -0.389529
|
||||
0xb63c
|
||||
// -0.421876
|
||||
0xb6c0
|
||||
// 0.290180
|
||||
0x34a5
|
||||
// 0.185524
|
||||
0x31f0
|
||||
// 0.789968
|
||||
0x3a52
|
||||
// -0.148520
|
||||
0xb0c1
|
||||
// 0.123930
|
||||
0x2fee
|
||||
// 0.032118
|
||||
0x281c
|
||||
// -0.478016
|
||||
0xb7a6
|
||||
// 0.914179
|
||||
0x3b50
|
||||
// -0.400779
|
||||
0xb66a
|
||||
// 0.071360
|
||||
0x2c91
|
||||
// 0.421843
|
||||
0x36c0
|
||||
// 0.675201
|
||||
0x3967
|
||||
// 0.011714
|
||||
0x21ff
|
||||
// -0.307025
|
||||
0xb4ea
|
||||
// 0.725770
|
||||
0x39ce
|
||||
// -0.346869
|
||||
0xb58d
|
||||
// 0.300160
|
||||
0x34cd
|
||||
// -0.156293
|
||||
0xb100
|
||||
// -0.434623
|
||||
0xb6f4
|
||||
// 0.765252
|
||||
0x3a1f
|
||||
// 0.247563
|
||||
0x33ec
|
||||
// 0.110453
|
||||
0x2f12
|
||||
// 0.171094
|
||||
0x317a
|
||||
// -0.096979
|
||||
0xae35
|
||||
// 0.228860
|
||||
0x3353
|
||||
// -0.030445
|
||||
0xa7cb
|
||||
// -0.052441
|
||||
0xaab6
|
||||
// -0.262839
|
||||
0xb435
|
||||
// 0.294711
|
||||
0x34b7
|
||||
// -0.363569
|
||||
0xb5d1
|
||||
// 0.107500
|
||||
0x2ee1
|
||||
// 0.506859
|
||||
0x380e
|
||||
// 0.054092
|
||||
0x2aec
|
||||
// 0.599363
|
||||
0x38cb
|
||||
// -0.368742
|
||||
0xb5e6
|
||||
// -0.781094
|
||||
0xba40
|
||||
// -0.065162
|
||||
0xac2c
|
||||
// 0.065177
|
||||
0x2c2c
|
||||
// -0.604829
|
||||
0xb8d7
|
||||
// -0.003879
|
||||
0x9bf2
|
||||
// 0.157615
|
||||
0x310b
|
||||
// -0.207477
|
||||
0xb2a4
|
||||
// -0.447515
|
||||
0xb729
|
||||
// -0.676908
|
||||
0xb96a
|
||||
// -0.186246
|
||||
0xb1f6
|
||||
// -0.138838
|
||||
0xb071
|
||||
// 0.105986
|
||||
0x2ec8
|
||||
// -0.404153
|
||||
0xb677
|
||||
// -0.252231
|
||||
0xb409
|
||||
// -0.145413
|
||||
0xb0a7
|
||||
// 0.209444
|
||||
0x32b4
|
||||
// -0.260962
|
||||
0xb42d
|
||||
// 0.098829
|
||||
0x2e53
|
||||
// -0.163146
|
||||
0xb138
|
||||
// 0.057192
|
||||
0x2b52
|
||||
// -0.436173
|
||||
0xb6fb
|
||||
// -0.292779
|
||||
0xb4af
|
||||
// -0.020365
|
||||
0xa537
|
||||
// -0.033979
|
||||
0xa859
|
||||
// 0.086233
|
||||
0x2d85
|
||||
// 0.073375
|
||||
0x2cb2
|
||||
// -0.006015
|
||||
0x9e29
|
||||
// -0.201411
|
||||
0xb272
|
||||
// -0.674383
|
||||
0xb965
|
||||
// -0.140275
|
||||
0xb07d
|
||||
// -0.400734
|
||||
0xb669
|
||||
// -0.148252
|
||||
0xb0be
|
||||
// -0.262301
|
||||
0xb432
|
||||
// -0.163921
|
||||
0xb13f
|
||||
// -0.432204
|
||||
0xb6ea
|
||||
// -0.019905
|
||||
0xa518
|
||||
// 0.081151
|
||||
0x2d32
|
||||
// 0.453349
|
||||
0x3741
|
||||
// 0.588064
|
||||
0x38b4
|
||||
// -0.420738
|
||||
0xb6bb
|
||||
// 0.103660
|
||||
0x2ea2
|
||||
// 0.385484
|
||||
0x362b
|
||||
// -0.174786
|
||||
0xb198
|
||||
// -0.116991
|
||||
0xaf7d
|
||||
// 0.118594
|
||||
0x2f97
|
||||
// 0.258199
|
||||
0x3422
|
||||
// -0.082567
|
||||
0xad49
|
||||
// -0.228460
|
||||
0xb350
|
||||
// 0.075378
|
||||
0x2cd3
|
||||
// 0.391532
|
||||
0x3644
|
||||
// -0.602685
|
||||
0xb8d2
|
||||
// 0.241720
|
||||
0x33bc
|
||||
// -0.013486
|
||||
0xa2e8
|
||||
// 0.071506
|
||||
0x2c94
|
||||
// -0.421687
|
||||
0xb6bf
|
||||
// -0.228767
|
||||
0xb352
|
||||
// 0.145997
|
||||
0x30ac
|
||||
// 0.676217
|
||||
0x3969
|
||||
// -0.097938
|
||||
0xae45
|
||||
// 0.477692
|
||||
0x37a5
|
||||
// -0.176282
|
||||
0xb1a4
|
||||
// -0.409477
|
||||
0xb68d
|
||||
// 0.034974
|
||||
0x287a
|
||||
// -0.103844
|
||||
0xaea5
|
||||
// -0.499133
|
||||
0xb7fc
|
||||
// -0.829460
|
||||
0xbaa3
|
||||
// -0.182757
|
||||
0xb1d9
|
||||
// -0.506883
|
||||
0xb80e
|
||||
// 0.522298
|
||||
0x382e
|
||||
// -0.522191
|
||||
0xb82d
|
||||
// 0.065033
|
||||
0x2c29
|
||||
// 0.091117
|
||||
0x2dd5
|
||||
// 0.129558
|
||||
0x3025
|
||||
// 0.043250
|
||||
0x2989
|
||||
// -0.003618
|
||||
0x9b69
|
||||
// 0.032091
|
||||
0x281c
|
||||
// -0.026998
|
||||
0xa6e9
|
||||
// 0.198279
|
||||
0x3258
|
||||
// -0.505518
|
||||
0xb80b
|
||||
// 0.455219
|
||||
0x3749
|
||||
// -0.204145
|
||||
0xb288
|
||||
// 0.230497
|
||||
0x3360
|
||||
// -0.055522
|
||||
0xab1b
|
||||
// -0.576398
|
||||
0xb89c
|
||||
// 0.330282
|
||||
0x3549
|
||||
// -0.262504
|
||||
0xb433
|
||||
// 0.487315
|
||||
0x37cc
|
||||
// -0.240474
|
||||
0xb3b2
|
||||
// 0.345253
|
||||
0x3586
|
||||
// 0.156623
|
||||
0x3103
|
||||
// 0.317572
|
||||
0x3515
|
||||
// 0.295017
|
||||
0x34b8
|
||||
// 0.130245
|
||||
0x302b
|
||||
// -0.217621
|
||||
0xb2f7
|
||||
// -0.040496
|
||||
0xa92f
|
||||
// -0.285559
|
||||
0xb492
|
||||
// -0.214891
|
||||
0xb2e0
|
||||
// -0.037703
|
||||
0xa8d3
|
||||
// -0.455534
|
||||
0xb74a
|
||||
// -0.394148
|
||||
0xb64e
|
||||
// 0.121496
|
||||
0x2fc7
|
||||
// 0.384751
|
||||
0x3628
|
||||
// -0.289029
|
||||
0xb4a0
|
||||
// -0.389529
|
||||
0xb63c
|
||||
// -0.421876
|
||||
0xb6c0
|
||||
// 0.290180
|
||||
0x34a5
|
||||
// 0.185524
|
||||
0x31f0
|
||||
// 0.789968
|
||||
0x3a52
|
||||
// -0.148520
|
||||
0xb0c1
|
||||
// 0.123930
|
||||
0x2fee
|
||||
// 0.168120
|
||||
0x3161
|
||||
// -0.101402
|
||||
0xae7d
|
||||
// -0.358987
|
||||
0xb5be
|
||||
// -0.749984
|
||||
0xba00
|
||||
// -0.323443
|
||||
0xb52d
|
||||
// 0.084039
|
||||
0x2d61
|
||||
// -0.622083
|
||||
0xb8fa
|
||||
// 0.032118
|
||||
0x281c
|
||||
// -0.478016
|
||||
0xb7a6
|
||||
// 0.914179
|
||||
0x3b50
|
||||
// -0.400779
|
||||
0xb66a
|
||||
// 0.071360
|
||||
0x2c91
|
||||
// 0.421843
|
||||
0x36c0
|
||||
// 0.675201
|
||||
0x3967
|
||||
// 0.011714
|
||||
0x21ff
|
||||
// -0.307025
|
||||
0xb4ea
|
||||
// 0.725770
|
||||
0x39ce
|
||||
// -0.346869
|
||||
0xb58d
|
||||
// 0.300160
|
||||
0x34cd
|
||||
// -0.156293
|
||||
0xb100
|
||||
// -0.434623
|
||||
0xb6f4
|
||||
// 0.765252
|
||||
0x3a1f
|
||||
// 0.247563
|
||||
0x33ec
|
||||
// -0.072272
|
||||
0xaca0
|
||||
// 0.362537
|
||||
0x35cd
|
||||
// -0.229903
|
||||
0xb35b
|
||||
// 0.161554
|
||||
0x312b
|
||||
// 0.311479
|
||||
0x34fc
|
||||
// -0.004970
|
||||
0x9d17
|
||||
// 0.198138
|
||||
0x3257
|
||||
// 0.110453
|
||||
0x2f12
|
||||
// 0.171094
|
||||
0x317a
|
||||
// -0.096979
|
||||
0xae35
|
||||
// 0.228860
|
||||
0x3353
|
||||
// -0.030445
|
||||
0xa7cb
|
||||
// -0.052441
|
||||
0xaab6
|
||||
// -0.262839
|
||||
0xb435
|
||||
// 0.294711
|
||||
0x34b7
|
||||
// -0.363569
|
||||
0xb5d1
|
||||
// 0.107500
|
||||
0x2ee1
|
||||
// 0.506859
|
||||
0x380e
|
||||
// 0.054092
|
||||
0x2aec
|
||||
// 0.599363
|
||||
0x38cb
|
||||
// -0.368742
|
||||
0xb5e6
|
||||
// -0.781094
|
||||
0xba40
|
||||
// -0.065162
|
||||
0xac2c
|
||||
// 0.031448
|
||||
0x2806
|
||||
// 0.088655
|
||||
0x2dad
|
||||
// -0.035384
|
||||
0xa887
|
||||
// -1.267741
|
||||
0xbd12
|
||||
// 0.424117
|
||||
0x36c9
|
||||
// 0.327717
|
||||
0x353e
|
||||
// 0.660219
|
||||
0x3948
|
||||
@ -0,0 +1,152 @@
|
||||
#include "BinaryTestsF16.h"
|
||||
#include <stdio.h>
|
||||
#include "Error.h"
|
||||
|
||||
#define SNR_THRESHOLD 60
|
||||
|
||||
/*
|
||||
|
||||
Reference patterns are generated with
|
||||
a double precision computation.
|
||||
|
||||
*/
|
||||
#define REL_ERROR (2.0e-3)
|
||||
#define ABS_ERROR (2.0e-3)
|
||||
|
||||
|
||||
/* Upper bound of maximum matrix dimension used by Python */
|
||||
#define MAXMATRIXDIM 40
|
||||
|
||||
|
||||
#define LOADDATA2() \
|
||||
const float16_t *inp1=input1.ptr(); \
|
||||
const float16_t *inp2=input2.ptr(); \
|
||||
\
|
||||
float16_t *ap=a.ptr(); \
|
||||
float16_t *bp=b.ptr(); \
|
||||
\
|
||||
float16_t *outp=output.ptr(); \
|
||||
int16_t *dimsp = dims.ptr(); \
|
||||
int nbMatrixes = dims.nbSamples() / 3;\
|
||||
int rows,internal,columns; \
|
||||
int i;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#define PREPAREDATA2() \
|
||||
in1.numRows=rows; \
|
||||
in1.numCols=internal; \
|
||||
memcpy((void*)ap,(const void*)inp1,2*sizeof(float16_t)*rows*internal);\
|
||||
in1.pData = ap; \
|
||||
\
|
||||
in2.numRows=internal; \
|
||||
in2.numCols=columns; \
|
||||
memcpy((void*)bp,(const void*)inp2,2*sizeof(float16_t)*internal*columns);\
|
||||
in2.pData = bp; \
|
||||
\
|
||||
out.numRows=rows; \
|
||||
out.numCols=columns; \
|
||||
out.pData = outp;
|
||||
|
||||
|
||||
|
||||
void BinaryTestsF16::test_mat_mult_f16()
|
||||
{
|
||||
LOADDATA2();
|
||||
|
||||
for(i=0;i < nbMatrixes ; i ++)
|
||||
{
|
||||
rows = *dimsp++;
|
||||
internal = *dimsp++;
|
||||
columns = *dimsp++;
|
||||
|
||||
PREPAREDATA2();
|
||||
|
||||
arm_mat_mult_f16(&this->in1,&this->in2,&this->out);
|
||||
|
||||
outp += (rows * columns);
|
||||
|
||||
}
|
||||
|
||||
ASSERT_EMPTY_TAIL(output);
|
||||
|
||||
ASSERT_SNR(output,ref,(float16_t)SNR_THRESHOLD);
|
||||
|
||||
ASSERT_CLOSE_ERROR(output,ref,ABS_ERROR,REL_ERROR);
|
||||
|
||||
}
|
||||
|
||||
void BinaryTestsF16::test_mat_cmplx_mult_f16()
|
||||
{
|
||||
LOADDATA2();
|
||||
|
||||
for(i=0;i < nbMatrixes ; i ++)
|
||||
{
|
||||
rows = *dimsp++;
|
||||
internal = *dimsp++;
|
||||
columns = *dimsp++;
|
||||
|
||||
PREPAREDATA2();
|
||||
|
||||
arm_mat_cmplx_mult_f16(&this->in1,&this->in2,&this->out);
|
||||
|
||||
outp += (2*rows * columns);
|
||||
|
||||
}
|
||||
|
||||
ASSERT_EMPTY_TAIL(output);
|
||||
|
||||
ASSERT_SNR(output,ref,(float16_t)SNR_THRESHOLD);
|
||||
|
||||
ASSERT_CLOSE_ERROR(output,ref,ABS_ERROR,REL_ERROR);
|
||||
|
||||
}
|
||||
|
||||
|
||||
void BinaryTestsF16::setUp(Testing::testID_t id,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr)
|
||||
{
|
||||
|
||||
|
||||
(void)params;
|
||||
switch(id)
|
||||
{
|
||||
case TEST_MAT_MULT_F16_1:
|
||||
input1.reload(BinaryTestsF16::INPUTS1_F16_ID,mgr);
|
||||
input2.reload(BinaryTestsF16::INPUTS2_F16_ID,mgr);
|
||||
dims.reload(BinaryTestsF16::DIMSBINARY1_S16_ID,mgr);
|
||||
|
||||
ref.reload(BinaryTestsF16::REFMUL1_F16_ID,mgr);
|
||||
|
||||
output.create(ref.nbSamples(),BinaryTestsF16::OUT_F16_ID,mgr);
|
||||
a.create(MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsF16::TMPA_F16_ID,mgr);
|
||||
b.create(MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsF16::TMPB_F16_ID,mgr);
|
||||
break;
|
||||
|
||||
case TEST_MAT_CMPLX_MULT_F16_2:
|
||||
input1.reload(BinaryTestsF16::INPUTSC1_F16_ID,mgr);
|
||||
input2.reload(BinaryTestsF16::INPUTSC2_F16_ID,mgr);
|
||||
dims.reload(BinaryTestsF16::DIMSBINARY1_S16_ID,mgr);
|
||||
|
||||
ref.reload(BinaryTestsF16::REFCMPLXMUL1_F16_ID,mgr);
|
||||
|
||||
output.create(ref.nbSamples(),BinaryTestsF16::OUT_F16_ID,mgr);
|
||||
a.create(2*MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsF16::TMPA_F16_ID,mgr);
|
||||
b.create(2*MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsF16::TMPB_F16_ID,mgr);
|
||||
break;
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
void BinaryTestsF16::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)
|
||||
{
|
||||
(void)id;
|
||||
output.dump(mgr);
|
||||
}
|
||||
@ -0,0 +1,408 @@
|
||||
#include "UnaryTestsF16.h"
|
||||
#include <stdio.h>
|
||||
#include "Error.h"
|
||||
|
||||
#define SNR_THRESHOLD 60
|
||||
|
||||
/*
|
||||
|
||||
Reference patterns are generated with
|
||||
a double precision computation.
|
||||
|
||||
*/
|
||||
#define REL_ERROR (1.1e-3)
|
||||
#define ABS_ERROR (1.1e-3)
|
||||
|
||||
/*
|
||||
|
||||
Comparisons for inverse
|
||||
|
||||
*/
|
||||
|
||||
/* Not very accurate for big matrix.
|
||||
But big matrix needed for checking the vectorized code */
|
||||
|
||||
#define SNR_THRESHOLD_INV 45
|
||||
#define REL_ERROR_INV (3.0e-2)
|
||||
#define ABS_ERROR_INV (3.0e-2)
|
||||
|
||||
/* Upper bound of maximum matrix dimension used by Python */
|
||||
#define MAXMATRIXDIM 40
|
||||
|
||||
#define LOADDATA2() \
|
||||
const float16_t *inp1=input1.ptr(); \
|
||||
const float16_t *inp2=input2.ptr(); \
|
||||
\
|
||||
float16_t *ap=a.ptr(); \
|
||||
float16_t *bp=b.ptr(); \
|
||||
\
|
||||
float16_t *outp=output.ptr(); \
|
||||
int16_t *dimsp = dims.ptr(); \
|
||||
int nbMatrixes = dims.nbSamples() >> 1;\
|
||||
int rows,columns; \
|
||||
int i;
|
||||
|
||||
#define LOADDATA1() \
|
||||
const float16_t *inp1=input1.ptr(); \
|
||||
\
|
||||
float16_t *ap=a.ptr(); \
|
||||
\
|
||||
float16_t *outp=output.ptr(); \
|
||||
int16_t *dimsp = dims.ptr(); \
|
||||
int nbMatrixes = dims.nbSamples() >> 1;\
|
||||
int rows,columns; \
|
||||
int i;
|
||||
|
||||
#define PREPAREDATA2() \
|
||||
in1.numRows=rows; \
|
||||
in1.numCols=columns; \
|
||||
memcpy((void*)ap,(const void*)inp1,sizeof(float16_t)*rows*columns);\
|
||||
in1.pData = ap; \
|
||||
\
|
||||
in2.numRows=rows; \
|
||||
in2.numCols=columns; \
|
||||
memcpy((void*)bp,(const void*)inp2,sizeof(float16_t)*rows*columns);\
|
||||
in2.pData = bp; \
|
||||
\
|
||||
out.numRows=rows; \
|
||||
out.numCols=columns; \
|
||||
out.pData = outp;
|
||||
|
||||
#define PREPAREDATA1(TRANSPOSED) \
|
||||
in1.numRows=rows; \
|
||||
in1.numCols=columns; \
|
||||
memcpy((void*)ap,(const void*)inp1,sizeof(float16_t)*rows*columns);\
|
||||
in1.pData = ap; \
|
||||
\
|
||||
if (TRANSPOSED) \
|
||||
{ \
|
||||
out.numRows=columns; \
|
||||
out.numCols=rows; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
out.numRows=rows; \
|
||||
out.numCols=columns; \
|
||||
} \
|
||||
out.pData = outp;
|
||||
|
||||
#define PREPAREDATA1C(TRANSPOSED) \
|
||||
in1.numRows=rows; \
|
||||
in1.numCols=columns; \
|
||||
memcpy((void*)ap,(const void*)inp1,2*sizeof(float16_t)*rows*columns);\
|
||||
in1.pData = ap; \
|
||||
\
|
||||
if (TRANSPOSED) \
|
||||
{ \
|
||||
out.numRows=columns; \
|
||||
out.numCols=rows; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
out.numRows=rows; \
|
||||
out.numCols=columns; \
|
||||
} \
|
||||
out.pData = outp;
|
||||
|
||||
#define LOADVECDATA2() \
|
||||
const float16_t *inp1=input1.ptr(); \
|
||||
const float16_t *inp2=input2.ptr(); \
|
||||
\
|
||||
float16_t *ap=a.ptr(); \
|
||||
float16_t *bp=b.ptr(); \
|
||||
\
|
||||
float16_t *outp=output.ptr(); \
|
||||
int16_t *dimsp = dims.ptr(); \
|
||||
int nbMatrixes = dims.nbSamples() / 2;\
|
||||
int rows,internal; \
|
||||
int i;
|
||||
|
||||
#define PREPAREVECDATA2() \
|
||||
in1.numRows=rows; \
|
||||
in1.numCols=internal; \
|
||||
memcpy((void*)ap,(const void*)inp1,2*sizeof(float16_t)*rows*internal);\
|
||||
in1.pData = ap; \
|
||||
\
|
||||
memcpy((void*)bp,(const void*)inp2,2*sizeof(float16_t)*internal);
|
||||
|
||||
|
||||
|
||||
void UnaryTestsF16::test_mat_vec_mult_f16()
|
||||
{
|
||||
LOADVECDATA2();
|
||||
|
||||
for(i=0;i < nbMatrixes ; i ++)
|
||||
{
|
||||
rows = *dimsp++;
|
||||
internal = *dimsp++;
|
||||
|
||||
PREPAREVECDATA2();
|
||||
|
||||
arm_mat_vec_mult_f16(&this->in1, bp, outp);
|
||||
|
||||
outp += rows ;
|
||||
|
||||
}
|
||||
|
||||
ASSERT_EMPTY_TAIL(output);
|
||||
|
||||
ASSERT_SNR(output,ref,(float16_t)SNR_THRESHOLD);
|
||||
|
||||
ASSERT_CLOSE_ERROR(output,ref,ABS_ERROR,REL_ERROR);
|
||||
|
||||
}
|
||||
|
||||
void UnaryTestsF16::test_mat_add_f16()
|
||||
{
|
||||
LOADDATA2();
|
||||
|
||||
for(i=0;i < nbMatrixes ; i ++)
|
||||
{
|
||||
rows = *dimsp++;
|
||||
columns = *dimsp++;
|
||||
|
||||
PREPAREDATA2();
|
||||
|
||||
arm_mat_add_f16(&this->in1,&this->in2,&this->out);
|
||||
|
||||
outp += (rows * columns);
|
||||
|
||||
}
|
||||
|
||||
ASSERT_EMPTY_TAIL(output);
|
||||
|
||||
ASSERT_SNR(output,ref,(float16_t)SNR_THRESHOLD);
|
||||
|
||||
ASSERT_CLOSE_ERROR(output,ref,ABS_ERROR,REL_ERROR);
|
||||
|
||||
}
|
||||
|
||||
void UnaryTestsF16::test_mat_sub_f16()
|
||||
{
|
||||
LOADDATA2();
|
||||
|
||||
for(i=0;i < nbMatrixes ; i ++)
|
||||
{
|
||||
rows = *dimsp++;
|
||||
columns = *dimsp++;
|
||||
|
||||
PREPAREDATA2();
|
||||
|
||||
arm_mat_sub_f16(&this->in1,&this->in2,&this->out);
|
||||
|
||||
outp += (rows * columns);
|
||||
|
||||
}
|
||||
|
||||
ASSERT_EMPTY_TAIL(output);
|
||||
|
||||
ASSERT_SNR(output,ref,(float16_t)SNR_THRESHOLD);
|
||||
|
||||
ASSERT_CLOSE_ERROR(output,ref,ABS_ERROR,REL_ERROR);
|
||||
|
||||
}
|
||||
|
||||
void UnaryTestsF16::test_mat_scale_f16()
|
||||
{
|
||||
LOADDATA1();
|
||||
|
||||
for(i=0;i < nbMatrixes ; i ++)
|
||||
{
|
||||
rows = *dimsp++;
|
||||
columns = *dimsp++;
|
||||
|
||||
PREPAREDATA1(false);
|
||||
|
||||
arm_mat_scale_f16(&this->in1,0.5f,&this->out);
|
||||
|
||||
outp += (rows * columns);
|
||||
|
||||
}
|
||||
|
||||
ASSERT_EMPTY_TAIL(output);
|
||||
|
||||
ASSERT_SNR(output,ref,(float16_t)SNR_THRESHOLD);
|
||||
|
||||
ASSERT_CLOSE_ERROR(output,ref,ABS_ERROR,REL_ERROR);
|
||||
|
||||
}
|
||||
|
||||
void UnaryTestsF16::test_mat_trans_f16()
|
||||
{
|
||||
LOADDATA1();
|
||||
|
||||
for(i=0;i < nbMatrixes ; i ++)
|
||||
{
|
||||
rows = *dimsp++;
|
||||
columns = *dimsp++;
|
||||
|
||||
PREPAREDATA1(true);
|
||||
|
||||
arm_mat_trans_f16(&this->in1,&this->out);
|
||||
|
||||
outp += (rows * columns);
|
||||
|
||||
}
|
||||
|
||||
ASSERT_EMPTY_TAIL(output);
|
||||
|
||||
ASSERT_SNR(output,ref,(float16_t)SNR_THRESHOLD);
|
||||
|
||||
ASSERT_CLOSE_ERROR(output,ref,ABS_ERROR,REL_ERROR);
|
||||
|
||||
}
|
||||
|
||||
void UnaryTestsF16::test_mat_cmplx_trans_f16()
|
||||
{
|
||||
LOADDATA1();
|
||||
|
||||
for(i=0;i < nbMatrixes ; i ++)
|
||||
{
|
||||
rows = *dimsp++;
|
||||
columns = *dimsp++;
|
||||
|
||||
PREPAREDATA1C(true);
|
||||
|
||||
arm_mat_cmplx_trans_f16(&this->in1,&this->out);
|
||||
|
||||
outp += 2*(rows * columns);
|
||||
|
||||
}
|
||||
|
||||
ASSERT_EMPTY_TAIL(output);
|
||||
|
||||
ASSERT_SNR(output,ref,(float16_t)SNR_THRESHOLD);
|
||||
|
||||
ASSERT_CLOSE_ERROR(output,ref,ABS_ERROR,REL_ERROR);
|
||||
|
||||
}
|
||||
|
||||
void UnaryTestsF16::test_mat_inverse_f16()
|
||||
{
|
||||
const float16_t *inp1=input1.ptr();
|
||||
|
||||
float16_t *ap=a.ptr();
|
||||
|
||||
float16_t *outp=output.ptr();
|
||||
int16_t *dimsp = dims.ptr();
|
||||
int nbMatrixes = dims.nbSamples();
|
||||
int rows,columns;
|
||||
int i;
|
||||
arm_status status;
|
||||
|
||||
for(i=0;i < nbMatrixes ; i ++)
|
||||
{
|
||||
rows = *dimsp++;
|
||||
columns = rows;
|
||||
|
||||
PREPAREDATA1(false);
|
||||
|
||||
status=arm_mat_inverse_f16(&this->in1,&this->out);
|
||||
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
|
||||
|
||||
outp += (rows * columns);
|
||||
inp1 += (rows * columns);
|
||||
|
||||
}
|
||||
|
||||
ASSERT_EMPTY_TAIL(output);
|
||||
|
||||
ASSERT_SNR(output,ref,(float16_t)SNR_THRESHOLD_INV);
|
||||
|
||||
ASSERT_CLOSE_ERROR(output,ref,ABS_ERROR_INV,REL_ERROR_INV);
|
||||
|
||||
}
|
||||
|
||||
void UnaryTestsF16::setUp(Testing::testID_t id,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr)
|
||||
{
|
||||
|
||||
|
||||
(void)params;
|
||||
switch(id)
|
||||
{
|
||||
case TEST_MAT_ADD_F16_1:
|
||||
input1.reload(UnaryTestsF16::INPUTS1_F16_ID,mgr);
|
||||
input2.reload(UnaryTestsF16::INPUTS2_F16_ID,mgr);
|
||||
dims.reload(UnaryTestsF16::DIMSUNARY1_S16_ID,mgr);
|
||||
|
||||
ref.reload(UnaryTestsF16::REFADD1_F16_ID,mgr);
|
||||
|
||||
output.create(ref.nbSamples(),UnaryTestsF16::OUT_F16_ID,mgr);
|
||||
a.create(MAXMATRIXDIM*MAXMATRIXDIM,UnaryTestsF16::TMPA_F16_ID,mgr);
|
||||
b.create(MAXMATRIXDIM*MAXMATRIXDIM,UnaryTestsF16::TMPB_F16_ID,mgr);
|
||||
break;
|
||||
|
||||
case TEST_MAT_SUB_F16_2:
|
||||
input1.reload(UnaryTestsF16::INPUTS1_F16_ID,mgr);
|
||||
input2.reload(UnaryTestsF16::INPUTS2_F16_ID,mgr);
|
||||
dims.reload(UnaryTestsF16::DIMSUNARY1_S16_ID,mgr);
|
||||
|
||||
ref.reload(UnaryTestsF16::REFSUB1_F16_ID,mgr);
|
||||
|
||||
output.create(ref.nbSamples(),UnaryTestsF16::OUT_F16_ID,mgr);
|
||||
a.create(MAXMATRIXDIM*MAXMATRIXDIM,UnaryTestsF16::TMPA_F16_ID,mgr);
|
||||
b.create(MAXMATRIXDIM*MAXMATRIXDIM,UnaryTestsF16::TMPB_F16_ID,mgr);
|
||||
break;
|
||||
|
||||
case TEST_MAT_SCALE_F16_3:
|
||||
input1.reload(UnaryTestsF16::INPUTS1_F16_ID,mgr);
|
||||
dims.reload(UnaryTestsF16::DIMSUNARY1_S16_ID,mgr);
|
||||
|
||||
ref.reload(UnaryTestsF16::REFSCALE1_F16_ID,mgr);
|
||||
|
||||
output.create(ref.nbSamples(),UnaryTestsF16::OUT_F16_ID,mgr);
|
||||
a.create(MAXMATRIXDIM*MAXMATRIXDIM,UnaryTestsF16::TMPA_F16_ID,mgr);
|
||||
break;
|
||||
|
||||
case TEST_MAT_TRANS_F16_4:
|
||||
input1.reload(UnaryTestsF16::INPUTS1_F16_ID,mgr);
|
||||
dims.reload(UnaryTestsF16::DIMSUNARY1_S16_ID,mgr);
|
||||
|
||||
ref.reload(UnaryTestsF16::REFTRANS1_F16_ID,mgr);
|
||||
|
||||
output.create(ref.nbSamples(),UnaryTestsF16::OUT_F16_ID,mgr);
|
||||
a.create(MAXMATRIXDIM*MAXMATRIXDIM,UnaryTestsF16::TMPA_F16_ID,mgr);
|
||||
break;
|
||||
|
||||
case TEST_MAT_INVERSE_F16_5:
|
||||
input1.reload(UnaryTestsF16::INPUTSINV_F16_ID,mgr);
|
||||
dims.reload(UnaryTestsF16::DIMSINVERT1_S16_ID,mgr);
|
||||
|
||||
ref.reload(UnaryTestsF16::REFINV1_F16_ID,mgr);
|
||||
|
||||
output.create(ref.nbSamples(),UnaryTestsF16::OUT_F16_ID,mgr);
|
||||
a.create(MAXMATRIXDIM*MAXMATRIXDIM,UnaryTestsF16::TMPA_F16_ID,mgr);
|
||||
break;
|
||||
|
||||
case TEST_MAT_VEC_MULT_F16_6:
|
||||
input1.reload(UnaryTestsF16::INPUTS1_F16_ID,mgr);
|
||||
input2.reload(UnaryTestsF16::INPUTVEC1_F16_ID,mgr);
|
||||
dims.reload(UnaryTestsF16::DIMSUNARY1_S16_ID,mgr);
|
||||
|
||||
ref.reload(UnaryTestsF16::REFVECMUL1_F16_ID,mgr);
|
||||
|
||||
output.create(ref.nbSamples(),UnaryTestsF16::OUT_F16_ID,mgr);
|
||||
a.create(MAXMATRIXDIM*MAXMATRIXDIM,UnaryTestsF16::TMPA_F16_ID,mgr);
|
||||
b.create(MAXMATRIXDIM,UnaryTestsF16::TMPB_F16_ID,mgr);
|
||||
break;
|
||||
|
||||
case TEST_MAT_CMPLX_TRANS_F16_7:
|
||||
input1.reload(UnaryTestsF16::INPUTSC1_F16_ID,mgr);
|
||||
dims.reload(UnaryTestsF16::DIMSUNARY1_S16_ID,mgr);
|
||||
|
||||
ref.reload(UnaryTestsF16::REFTRANSC1_F16_ID,mgr);
|
||||
|
||||
output.create(ref.nbSamples(),UnaryTestsF16::OUT_F16_ID,mgr);
|
||||
a.create(MAXMATRIXDIM*MAXMATRIXDIM,UnaryTestsF16::TMPA_F16_ID,mgr);
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
void UnaryTestsF16::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)
|
||||
{
|
||||
(void)id;
|
||||
output.dump(mgr);
|
||||
}
|
||||
Loading…
Reference in New Issue