CMSIS-DSP: Added f16 matrix functions

pull/19/head
Christophe Favergeon 6 years ago
parent 7d79ffa51f
commit d2d691cc23

@ -145,6 +145,18 @@ __STATIC_FORCEINLINE float16x8_t __mve_cmplx_sum_intra_vec_f16(
Im = vgetq_lane(vecOut, 5); \
}
/**
 * @brief  Reduce a vector of f16 complex partial products to a single
 *         complex value and store it at pOut.
 *
 * Wraps the __mve_cmplx_sum_intra_vec_f16 reduction and writes the
 * resulting (real, imag) f16 pair with a single 32-bit store.
 *
 * @param[in]  vecIn  vector of interleaved complex partial sums
 * @param[out] pOut   destination for the complex result (2 x f16)
 *
 * NOTE(review): the 32-bit store type-puns pOut through float32_t and
 * assumes 4-byte alignment of the destination — confirm callers always
 * pass an even complex-element offset into a suitably aligned buffer.
 */
__STATIC_FORCEINLINE void mve_cmplx_sum_intra_vec_f16(
float16x8_t vecIn,
float16_t *pOut)
{
float16x8_t vecOut = __mve_cmplx_sum_intra_vec_f16(vecIn);
/*
 * Cmplx sum is in 4th & 5th f16 elt
 * use 32-bit extraction
 */
*(float32_t *) pOut = ((float32x4_t) vecOut)[2];
}
#define INVSQRT_MAGIC_F16 0x59ba /* ( 0x1ba = 0x3759df >> 13) */
#define INV_NEWTON_INIT_F16 0x7773
@ -167,17 +179,10 @@ __STATIC_FORCEINLINE float16x8_t __mve_cmplx_sum_intra_vec_f16(
/***************************************
Definitions available for MVEI only
Definitions available for MVEI and MVEF only
***************************************/
#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI)
#include "arm_common_tables.h"
#define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
#define MVE_ASRL_SAT32(acc, shift) ((sqrshrl(acc, -(32-shift)) >> 32) & 0xffffffff)
#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI)
/* Following functions are used to transpose matrix in f32 and q31 cases */
__STATIC_INLINE arm_status arm_mat_trans_32bit_2x2_mve(
uint32_t * pDataSrc,
@ -381,6 +386,125 @@ __STATIC_INLINE arm_status arm_mat_cmplx_trans_32bit(
return (ARM_MATH_SUCCESS);
}
/**
 * @brief  Transpose of a flattened 2x2 matrix of 16-bit elements.
 *
 * The diagonal entries keep their positions; the two off-diagonal
 * entries exchange places. No vector hardware is needed at this size.
 *
 * @param[in]  pDataSrc   source matrix data (4 elements, row-major)
 * @param[out] pDataDest  destination matrix data (4 elements, row-major)
 * @return     ARM_MATH_SUCCESS
 */
__STATIC_INLINE arm_status arm_mat_trans_16bit_2x2(uint16_t * pDataSrc, uint16_t * pDataDest)
{
    /* Diagonal elements are invariant under transposition. */
    *(pDataDest + 0) = *(pDataSrc + 0);
    *(pDataDest + 3) = *(pDataSrc + 3);
    /* Off-diagonal elements swap (src row-major index 1 <-> 2). */
    *(pDataDest + 2) = *(pDataSrc + 1);
    *(pDataDest + 1) = *(pDataSrc + 2);
    return (ARM_MATH_SUCCESS);
}
/**
 * @brief  Transpose of a flattened 3x3 matrix of 16-bit elements (MVE).
 *
 * Loads the first 8 of the 9 elements in one vector and scatter-stores
 * them to their transposed positions using the precomputed stride table;
 * the last element (index 8, which is on the diagonal) is copied with a
 * scalar move.
 *
 * @param[in]  pDataSrc   source matrix data (9 elements, row-major)
 * @param[out] pDataDest  destination matrix data (9 elements, row-major)
 * @return     ARM_MATH_SUCCESS
 */
__STATIC_INLINE arm_status arm_mat_trans_16bit_3x3_mve(uint16_t * pDataSrc, uint16_t * pDataDest)
{
/* Destination index for each of the first 8 source elements. */
static const uint16_t stridesTr33[8] = { 0, 3, 6, 1, 4, 7, 2, 5 };
uint16x8_t vecOffs1;
uint16x8_t vecIn1;
/*
 *
 * | 0 1 2 | | 0 3 6 | 8 x 16 flattened version | 0 3 6 1 4 7 2 5 |
 * | 3 4 5 | => | 1 4 7 | => | 8 . . . . . . . |
 * | 6 7 8 | | 2 5 8 | (row major)
 *
 */
vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr33);
vecIn1 = vldrhq_u16((uint16_t const *) pDataSrc);
/* Scatter the 8 loaded elements to their transposed slots. */
vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1);
/* Element 8 is the bottom-right diagonal entry: position unchanged. */
pDataDest[8] = pDataSrc[8];
return (ARM_MATH_SUCCESS);
}
/**
 * @brief  Transpose of a flattened 4x4 matrix of 16-bit elements (MVE).
 *
 * The 16 elements are processed as two 8-element vectors; each is
 * scatter-stored to its transposed position via a precomputed stride
 * table (one table per half of the matrix).
 *
 * @param[in]  pDataSrc   source matrix data (16 elements, row-major)
 * @param[out] pDataDest  destination matrix data (16 elements, row-major)
 * @return     ARM_MATH_SUCCESS
 */
__STATIC_INLINE arm_status arm_mat_trans_16bit_4x4_mve(uint16_t * pDataSrc, uint16_t * pDataDest)
{
/* Destination indices for source elements 0..7 and 8..15 respectively. */
static const uint16_t stridesTr44_1[8] = { 0, 4, 8, 12, 1, 5, 9, 13 };
static const uint16_t stridesTr44_2[8] = { 2, 6, 10, 14, 3, 7, 11, 15 };
uint16x8_t vecOffs1, vecOffs2;
uint16x8_t vecIn1, vecIn2;
uint16_t const * pDataSrcVec = (uint16_t const *) pDataSrc;
/*
 * 4x4 Matrix transposition
 *
 * | 0 1 2 3 | | 0 4 8 12 | 8 x 16 flattened version
 * | 4 5 6 7 | => | 1 5 9 13 | => [0 4 8 12 1 5 9 13]
 * | 8 9 10 11 | | 2 6 10 14 | [2 6 10 14 3 7 11 15]
 * | 12 13 14 15 | | 3 7 11 15 |
 */
vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr44_1);
vecOffs2 = vldrhq_u16((uint16_t const *) stridesTr44_2);
/* First half: rows 0-1. */
vecIn1 = vldrhq_u16(pDataSrcVec);
pDataSrcVec += 8;
/* Second half: rows 2-3. */
vecIn2 = vldrhq_u16(pDataSrcVec);
vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1);
vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs2, vecIn2);
return (ARM_MATH_SUCCESS);
}
/**
 * @brief  Generic transpose of a srcRows x srcCols matrix of 16-bit
 *         elements (MVE).
 *
 * Walks the source column by column: for each source column, a gather
 * load with column-stride offsets collects up to 8 row elements at a
 * time, which are then stored contiguously as one destination row.
 *
 * @param[in]  srcRows    number of rows in the source matrix
 * @param[in]  srcCols    number of columns in the source matrix
 * @param[in]  pDataSrc   source matrix data (row-major)
 * @param[out] pDataDest  destination matrix data (row-major, srcCols x srcRows)
 * @return     ARM_MATH_SUCCESS
 *
 * NOTE(review): the tail gather load is not predicated (only the store
 * is), so when srcRows is not a multiple of 8 the gather reads past the
 * last matrix row — confirm source buffers tolerate this over-read or
 * consider the _z gather variant.
 */
__STATIC_INLINE arm_status arm_mat_trans_16bit_generic(
uint16_t srcRows,
uint16_t srcCols,
uint16_t * pDataSrc,
uint16_t * pDataDest)
{
uint16x8_t vecOffs;
uint32_t i;
uint32_t blkCnt;
uint16_t const *pDataC;
uint16_t *pDataDestR;
uint16x8_t vecIn;
/* Offsets {0,1,...,7} * srcCols: 8 consecutive rows of one column. */
vecOffs = vidupq_u16((uint32_t)0, 1);
vecOffs = vecOffs * srcCols;
/* One iteration per source column (= per destination row). */
i = srcCols;
while(i > 0U)
{
pDataC = (uint16_t const *) pDataSrc;
pDataDestR = pDataDest;
/* Full groups of 8 rows. */
blkCnt = srcRows >> 3;
while (blkCnt > 0U)
{
vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs);
vstrhq_u16(pDataDestR, vecIn);
pDataDestR += 8;
/* Advance down the column by 8 rows. */
pDataC = pDataC + srcCols * 8;
/*
 * Decrement the blockSize loop counter
 */
blkCnt--;
}
/*
 * tail: remaining 0..7 rows, store predicated to blkCnt lanes
 */
blkCnt = srcRows & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs);
vstrhq_p_u16(pDataDestR, vecIn, p0);
}
/* Next source column / next destination row. */
pDataSrc += 1;
pDataDest += srcRows;
i--;
}
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_cmplx_trans_16bit(
uint16_t srcRows,
uint16_t srcCols,
@ -466,6 +590,20 @@ __STATIC_INLINE arm_status arm_mat_cmplx_trans_16bit(
return (ARM_MATH_SUCCESS);
}
#endif /* MVEF and MVEI */
/***************************************
Definitions available for MVEI only
***************************************/
#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI)
#include "arm_common_tables.h"
#define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
#define MVE_ASRL_SAT32(acc, shift) ((sqrshrl(acc, -(32-shift)) >> 32) & 0xffffffff)
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
__STATIC_INLINE q31x4_t FAST_VSQRT_Q31(q31x4_t vecIn)

@ -31,7 +31,150 @@ extern "C"
{
#endif
#include "arm_math_types_f16.h"
#include "arm_math_memory.h"
#include "dsp/none.h"
#include "dsp/utils.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
 * @brief Instance structure for the half-precision (f16) floating-point matrix.
 */
typedef struct
{
uint16_t numRows; /**< number of rows of the matrix. */
uint16_t numCols; /**< number of columns of the matrix. */
float16_t *pData; /**< points to the data of the matrix (numRows * numCols f16 elements). */
} arm_matrix_instance_f16;
/**
* @brief Floating-point matrix addition.
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_add_f16(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst);
/**
* @brief Floating-point, complex, matrix multiplication.
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_cmplx_mult_f16(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst);
/**
* @brief Floating-point matrix transpose.
* @param[in] pSrc points to the input matrix
* @param[out] pDst points to the output matrix
* @return The function returns either <code>ARM_MATH_SIZE_MISMATCH</code>
* or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_trans_f16(
const arm_matrix_instance_f16 * pSrc,
arm_matrix_instance_f16 * pDst);
/**
* @brief Floating-point complex matrix transpose.
* @param[in] pSrc points to the input matrix
* @param[out] pDst points to the output matrix
* @return The function returns either <code>ARM_MATH_SIZE_MISMATCH</code>
* or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_cmplx_trans_f16(
const arm_matrix_instance_f16 * pSrc,
arm_matrix_instance_f16 * pDst);
/**
* @brief Floating-point matrix multiplication
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_mult_f16(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst);
/**
* @brief Floating-point matrix and vector multiplication
* @param[in] pSrcMat points to the input matrix structure
* @param[in] pVec points to vector
* @param[out] pDst points to output vector
*/
void arm_mat_vec_mult_f16(
const arm_matrix_instance_f16 *pSrcMat,
const float16_t *pVec,
float16_t *pDst);
/**
* @brief Floating-point matrix subtraction
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_sub_f16(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst);
/**
* @brief Floating-point matrix scaling.
* @param[in] pSrc points to the input matrix
* @param[in] scale scale factor
* @param[out] pDst points to the output matrix
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_scale_f16(
const arm_matrix_instance_f16 * pSrc,
float16_t scale,
arm_matrix_instance_f16 * pDst);
/**
* @brief Floating-point matrix initialization.
* @param[in,out] S points to an instance of the floating-point matrix structure.
* @param[in] nRows number of rows in the matrix.
* @param[in] nColumns number of columns in the matrix.
* @param[in] pData points to the matrix data array.
*/
void arm_mat_init_f16(
arm_matrix_instance_f16 * S,
uint16_t nRows,
uint16_t nColumns,
float16_t * pData);
/**
* @brief Floating-point matrix inverse.
* @param[in] src points to the instance of the input floating-point matrix structure.
* @param[out] dst points to the instance of the output floating-point matrix structure.
* @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
* If the input matrix is singular (does not have an inverse), then the algorithm terminates and returns error status ARM_MATH_SINGULAR.
*/
arm_status arm_mat_inverse_f16(
const arm_matrix_instance_f16 * src,
arm_matrix_instance_f16 * dst);
#endif /*defined(ARM_FLOAT16_SUPPORTED)*/
#ifdef __cplusplus
}

@ -34,6 +34,7 @@ filtering.remove(os.path.join(ROOT,"Source","FilteringFunctions","FilteringFunct
matrix = glob.glob(os.path.join(ROOT,"Source","MatrixFunctions","*.c"))
matrix.remove(os.path.join(ROOT,"Source","MatrixFunctions","MatrixFunctions.c"))
matrix.remove(os.path.join(ROOT,"Source","MatrixFunctions","MatrixFunctionsF16.c"))
statistics = glob.glob(os.path.join(ROOT,"Source","StatisticsFunctions","*.c"))
statistics.remove(os.path.join(ROOT,"Source","StatisticsFunctions","StatisticsFunctions.c"))

@ -38,4 +38,18 @@ configDsp(CMSISDSPMatrix ${ROOT})
target_include_directories(CMSISDSPMatrix PUBLIC "${DSP}/Include")
if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16))
target_sources(CMSISDSPMatrix PRIVATE arm_mat_add_f16.c)
target_sources(CMSISDSPMatrix PRIVATE arm_mat_sub_f16.c)
target_sources(CMSISDSPMatrix PRIVATE arm_mat_trans_f16.c)
target_sources(CMSISDSPMatrix PRIVATE arm_mat_scale_f16.c)
target_sources(CMSISDSPMatrix PRIVATE arm_mat_mult_f16.c)
target_sources(CMSISDSPMatrix PRIVATE arm_mat_vec_mult_f16.c)
target_sources(CMSISDSPMatrix PRIVATE arm_mat_cmplx_trans_f16.c)
target_sources(CMSISDSPMatrix PRIVATE arm_mat_cmplx_mult_f16.c)
target_sources(CMSISDSPMatrix PRIVATE arm_mat_inverse_f16.c)
target_sources(CMSISDSPMatrix PRIVATE arm_mat_init_f16.c)
endif()

@ -0,0 +1,40 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: MatrixFunctions.c
* Description: Combination of all matrix function f16 source files.
*
* $Date: 18. March 2020
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_mat_add_f16.c"
#include "arm_mat_sub_f16.c"
#include "arm_mat_trans_f16.c"
#include "arm_mat_scale_f16.c"
#include "arm_mat_mult_f16.c"
#include "arm_mat_vec_mult_f16.c"
#include "arm_mat_cmplx_trans_f16.c"
#include "arm_mat_cmplx_mult_f16.c"
#include "arm_mat_inverse_f16.c"
#include "arm_mat_init_f16.c"

@ -0,0 +1,217 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_add_f16.c
* Description: Floating-point matrix addition
*
* $Date: 18. March 2020
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixAdd
@{
*/
/**
@brief Floating-point matrix addition.
@param[in] pSrcA points to first input matrix structure
@param[in] pSrcB points to second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_add_f16(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
arm_status status;
uint32_t numSamples; /* total number of elements in the matrix */
float16_t *pDataA, *pDataB, *pDataDst;
f16x8_t vecA, vecB, vecDst;
float16_t const *pSrcAVec;
float16_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (float16_t const *) pDataA;
pSrcBVec = (float16_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 3;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec);
pSrcAVec += 8;
vecB = vld1q(pSrcBVec);
pSrcBVec += 8;
vecDst = vaddq(vecA, vecB);
vst1q(pDataDst, vecDst);
pDataDst += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numSamples & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
vecDst = vaddq_m(vecDst, vecA, vecB, p0);
vstrhq_p(pDataDst, vecDst, p0);
}
/* set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
return (status);
}
#else
/**
 * @brief  Half-precision floating-point matrix addition (scalar fallback).
 *
 * Computes C(m,n) = A(m,n) + B(m,n) by walking the three matrices as
 * flat arrays; optionally unrolled four elements at a time.
 *
 * @param[in]  pSrcA  points to first input matrix structure
 * @param[in]  pSrcB  points to second input matrix structure
 * @param[out] pDst   points to output matrix structure
 * @return     ARM_MATH_SIZE_MISMATCH if dimensions differ (when
 *             ARM_MATH_MATRIX_CHECK is enabled), else ARM_MATH_SUCCESS.
 */
arm_status arm_mat_add_f16(
  const arm_matrix_instance_f16 * pSrcA,
  const arm_matrix_instance_f16 * pSrcB,
  arm_matrix_instance_f16 * pDst)
{
  float16_t *pA = pSrcA->pData;     /* walks input matrix A */
  float16_t *pB = pSrcB->pData;     /* walks input matrix B */
  float16_t *pC = pDst->pData;      /* walks the output matrix */
  arm_status status;                /* result of the operation */

#ifdef ARM_MATH_MATRIX_CHECK
  /* All three matrices must share identical dimensions. */
  if ((pSrcA->numRows != pSrcB->numRows) ||
      (pSrcA->numCols != pSrcB->numCols) ||
      (pSrcA->numRows != pDst->numRows)  ||
      (pSrcA->numCols != pDst->numCols))
  {
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    /* The matrices are processed as flat arrays of this many elements. */
    uint32_t numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
    uint32_t blkCnt;

#if defined (ARM_MATH_LOOPUNROLL)
    /* Unrolled: four sums per iteration. */
    for (blkCnt = numSamples >> 2U; blkCnt > 0U; blkCnt--)
    {
      /* C(m,n) = A(m,n) + B(m,n) */
      *pC++ = *pA++ + *pB++;
      *pC++ = *pA++ + *pB++;
      *pC++ = *pA++ + *pB++;
      *pC++ = *pA++ + *pB++;
    }
    /* 0..3 elements remain after the unrolled loop. */
    blkCnt = numSamples % 0x4U;
#else
    /* No unrolling: process every element in the plain loop below. */
    blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    for (; blkCnt > 0U; blkCnt--)
    {
      /* C(m,n) = A(m,n) + B(m,n) */
      *pC++ = *pA++ + *pB++;
    }

    status = ARM_MATH_SUCCESS;
  }
  /* Return to application */
  return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixAdd group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

@ -0,0 +1,930 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_cmplx_mult_f16.c
* Description: Floating-point matrix multiplication
*
* $Date: 18. March 2020
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup CmplxMatrixMult
@{
*/
/**
@brief Floating-point Complex matrix multiplication.
@param[in] pSrcA points to first input complex matrix structure
@param[in] pSrcB points to second input complex matrix structure
@param[out] pDst points to output complex matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include <stdio.h>
#include "arm_helium_utils.h"
#define DONTCARE 0 /* inactive lane content */
/**
 * @brief  2x2 complex f16 matrix multiplication specialization (MVE).
 *
 * Both rows of A and the whole of B (re-ordered so each half-vector
 * holds one B column) fit in single vectors; vcmulq/vcmlaq_rot90 form
 * the complex products and a vrev64-based intra-vector add produces the
 * two complex results per row, stored with 32-bit writes.
 *
 * @param[in]  pSrcA  first input complex matrix (2x2)
 * @param[in]  pSrcB  second input complex matrix (2x2)
 * @param[out] pDst   output complex matrix (2x2)
 * @return     ARM_MATH_SUCCESS
 */
__STATIC_FORCEINLINE arm_status arm_mat_cmplx_mult_f16_2x2_mve(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
const uint16_t MATRIX_DIM = 2;
float16_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
uint16x8_t vecColBOffs0,vecColAOffs0,vecColAOffs1;
float16_t *pInA0 = pInA;
f16x8_t acc0, acc1;
f16x8_t vecB, vecA0, vecA1;
f16x8_t vecTmp;
uint16_t tmp;
/* Gather offsets: lanes 0-3 = B column 0 (re,im pairs), lanes 4-7 = B column 1. */
static const uint16_t offsetB0[8] = { 0, 1,
MATRIX_DIM * CMPLX_DIM, MATRIX_DIM * CMPLX_DIM + 1,
2, 3,
MATRIX_DIM * CMPLX_DIM + 2 , MATRIX_DIM * CMPLX_DIM + 3,
};
vecColBOffs0 = vldrhq_u16((uint16_t const *) offsetB0);
tmp = 0;
/* Row 0 of A, duplicated into both half-vectors: offsets 0,1,2,3,0,1,2,3. */
vecColAOffs0 = viwdupq_u16(tmp, 4, 1);
/* NOTE(review): this assignment to tmp is a dead store — vecColAOffs1 is
 * computed directly from the constant on the next line. */
tmp = (CMPLX_DIM * MATRIX_DIM);
vecColAOffs1 = vecColAOffs0 + (uint16_t)(CMPLX_DIM * MATRIX_DIM);
pInB = (float16_t const *)pSrcB->pData;
vecA0 = vldrhq_gather_shifted_offset_f16(pInA0, vecColAOffs0);
vecA1 = vldrhq_gather_shifted_offset_f16(pInA0, vecColAOffs1);
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
/* Complex multiply-accumulate: vcmulq forms re*re/re*im terms,
 * vcmlaq_rot90 adds the im*im/im*re terms. */
acc0 = vcmulq(vecA0, vecB);
acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
acc1 = vcmulq(vecA1, vecB);
acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
/*
 * Compute
 * re0+re1 | im0+im1 | re0+re1 | im0+im1
 * re2+re3 | im2+im3 | re2+re3 | im2+im3
 */
vecTmp = (f16x8_t) vrev64q_s32((int32x4_t) acc0);
vecTmp = vaddq(vecTmp, acc0);
/* Row 0 results: 32-bit stores write one (re,im) f16 pair each. */
*(float32_t *)(&pOut[0 * CMPLX_DIM * MATRIX_DIM]) = ((f32x4_t)vecTmp)[0];
*(float32_t *)(&pOut[0 * CMPLX_DIM * MATRIX_DIM + CMPLX_DIM]) = ((f32x4_t)vecTmp)[2];
vecTmp = (f16x8_t) vrev64q_s32((int32x4_t) acc1);
vecTmp = vaddq(vecTmp, acc1);
/* Row 1 results. */
*(float32_t *)(&pOut[1 * CMPLX_DIM * MATRIX_DIM]) = ((f32x4_t)vecTmp)[0];
*(float32_t *)(&pOut[1 * CMPLX_DIM * MATRIX_DIM + CMPLX_DIM]) = ((f32x4_t)vecTmp)[2];
/*
 * Return to application
 */
return (ARM_MATH_SUCCESS);
}
/**
 * @brief  3x3 complex f16 matrix multiplication specialization (MVE).
 *
 * Each row of A (3 complex = 6 f16 values) is held in one vector; each
 * column of B is gathered with a predicated load (upper 2 lanes zeroed).
 * One output column is produced per section below, with the complex
 * dot products reduced by mve_cmplx_sum_intra_vec_f16.
 *
 * @param[in]  pSrcA  first input complex matrix (3x3)
 * @param[in]  pSrcB  second input complex matrix (3x3)
 * @param[out] pDst   output complex matrix (3x3)
 * @return     ARM_MATH_SUCCESS
 *
 * NOTE(review): the unpredicated vldrhq_f16 row loads read 8 f16 values
 * while a row holds only 6, so the load of the last row reads 2 halves
 * past the end of the A matrix — confirm buffers tolerate this over-read.
 */
__STATIC_FORCEINLINE arm_status arm_mat_cmplx_mult_f16_3x3_mve(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
const uint16_t MATRIX_DIM = 3;
float16_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
uint16x8_t vecColBOffs0;
float16_t *pInA0 = pInA;
float16_t *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM;
float16_t *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM;
f16x8_t acc0, acc1, acc2;
f16x8_t vecB, vecA0, vecA1, vecA2;
/* Gather offsets of one B column: (re,im) of rows 0..2; last 2 lanes unused. */
static const uint16_t offsetB0[8] = { 0, 1,
MATRIX_DIM * CMPLX_DIM, MATRIX_DIM * CMPLX_DIM + 1,
2 * MATRIX_DIM * CMPLX_DIM, 2 * MATRIX_DIM * CMPLX_DIM + 1,
DONTCARE, DONTCARE
};
/* enable predication to disable upper half complex vector element */
mve_pred16_t p0 = vctp16q(MATRIX_DIM * CMPLX_DIM);
vecColBOffs0 = vldrhq_u16((uint16_t const *) offsetB0);
pInB = (float16_t const *)pSrcB->pData;
/* The three A rows are loaded once and reused for all three B columns. */
vecA0 = vldrhq_f16(pInA0);
vecA1 = vldrhq_f16(pInA1);
vecA2 = vldrhq_f16(pInA2);
/* ---- B column 0 ---- */
vecB = vldrhq_gather_shifted_offset_z(pInB, vecColBOffs0, p0);
acc0 = vcmulq(vecA0, vecB);
acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
acc1 = vcmulq(vecA1, vecB);
acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
acc2 = vcmulq(vecA2, vecB);
acc2 = vcmlaq_rot90(acc2, vecA2, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
pOut += CMPLX_DIM;
/*
 * move to next B column
 */
pInB = pInB + CMPLX_DIM;
/* ---- B column 1 ---- */
vecB = vldrhq_gather_shifted_offset_z(pInB, vecColBOffs0, p0);
acc0 = vcmulq(vecA0, vecB);
acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
acc1 = vcmulq(vecA1, vecB);
acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
acc2 = vcmulq(vecA2, vecB);
acc2 = vcmlaq_rot90(acc2, vecA2, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
pOut += CMPLX_DIM;
/*
 * move to next B column
 */
pInB = pInB + CMPLX_DIM;
/* ---- B column 2 ---- */
vecB = vldrhq_gather_shifted_offset_z(pInB, vecColBOffs0, p0);
acc0 = vcmulq(vecA0, vecB);
acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
acc1 = vcmulq(vecA1, vecB);
acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
acc2 = vcmulq(vecA2, vecB);
acc2 = vcmlaq_rot90(acc2, vecA2, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
/*
 * Return to application
 */
return (ARM_MATH_SUCCESS);
}
/**
 * @brief  4x4 complex f16 matrix multiplication specialization (MVE).
 *
 * A row of A (4 complex = 8 f16 values) exactly fills one vector, so no
 * predication is needed. Each of the four sections below gathers one B
 * column, forms the four complex row dot products with
 * vcmulq/vcmlaq_rot90, and reduces them via mve_cmplx_sum_intra_vec_f16
 * into one output column.
 *
 * @param[in]  pSrcA  first input complex matrix (4x4)
 * @param[in]  pSrcB  second input complex matrix (4x4)
 * @param[out] pDst   output complex matrix (4x4)
 * @return     ARM_MATH_SUCCESS
 */
__STATIC_FORCEINLINE arm_status arm_mat_cmplx_mult_f16_4x4_mve(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
const uint16_t MATRIX_DIM = 4;
float16_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
uint16x8_t vecColBOffs0;
float16_t *pInA0 = pInA;
float16_t *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM;
float16_t *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM;
float16_t *pInA3 = pInA2 + CMPLX_DIM * MATRIX_DIM;
f16x8_t acc0, acc1, acc2, acc3;
f16x8_t vecB, vecA;
/* Gather offsets of one B column: (re,im) of rows 0..3. */
static const uint16_t offsetB0[8] = { 0, 1,
MATRIX_DIM * CMPLX_DIM, MATRIX_DIM * CMPLX_DIM + 1,
2 * MATRIX_DIM * CMPLX_DIM, 2 * MATRIX_DIM * CMPLX_DIM + 1,
3 * MATRIX_DIM * CMPLX_DIM, 3 * MATRIX_DIM * CMPLX_DIM + 1
};
vecColBOffs0 = vldrhq_u16((uint16_t const *) offsetB0);
pInB = (float16_t const *)pSrcB->pData;
/* ---- B column 0 ---- */
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
vecA = vldrhq_f16(pInA0);
acc0 = vcmulq(vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
vecA = vldrhq_f16(pInA1);
acc1 = vcmulq(vecA, vecB);
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
vecA = vldrhq_f16(pInA2);
acc2 = vcmulq(vecA, vecB);
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
vecA = vldrhq_f16(pInA3);
acc3 = vcmulq(vecA, vecB);
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
pOut += CMPLX_DIM;
/*
 * move to next B column
 */
pInB = pInB + CMPLX_DIM;
/* ---- B column 1 ---- */
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
vecA = vldrhq_f16(pInA0);
acc0 = vcmulq(vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
vecA = vldrhq_f16(pInA1);
acc1 = vcmulq(vecA, vecB);
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
vecA = vldrhq_f16(pInA2);
acc2 = vcmulq(vecA, vecB);
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
vecA = vldrhq_f16(pInA3);
acc3 = vcmulq(vecA, vecB);
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
pOut += CMPLX_DIM;
/*
 * move to next B column
 */
pInB = pInB + CMPLX_DIM;
/* ---- B column 2 ---- */
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
vecA = vldrhq_f16(pInA0);
acc0 = vcmulq(vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
vecA = vldrhq_f16(pInA1);
acc1 = vcmulq(vecA, vecB);
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
vecA = vldrhq_f16(pInA2);
acc2 = vcmulq(vecA, vecB);
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
vecA = vldrhq_f16(pInA3);
acc3 = vcmulq(vecA, vecB);
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
pOut += CMPLX_DIM;
/*
 * move to next B column
 */
pInB = pInB + CMPLX_DIM;
/* ---- B column 3 ---- */
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
vecA = vldrhq_f16(pInA0);
acc0 = vcmulq(vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
vecA = vldrhq_f16(pInA1);
acc1 = vcmulq(vecA, vecB);
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
vecA = vldrhq_f16(pInA2);
acc2 = vcmulq(vecA, vecB);
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
vecA = vldrhq_f16(pInA3);
acc3 = vcmulq(vecA, vecB);
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
/*
 * Return to application
 */
return (ARM_MATH_SUCCESS);
}
arm_status arm_mat_cmplx_mult_f16(
  const arm_matrix_instance_f16 * pSrcA,
  const arm_matrix_instance_f16 * pSrcB,
  arm_matrix_instance_f16 * pDst)
{
    /* MVE (Helium) complex f16 matrix multiply: pDst = pSrcA * pSrcB.
     * Strategy: process 4 rows of A at a time against one column of B,
     * loading 4 row-chunks of the B column with a single vector gather.
     * Small square matrices (1x1..4x4) are dispatched to specialized kernels. */
    float16_t const *pInB = (float16_t const *) pSrcB->pData;   /* input data matrix pointer B */
    float16_t const *pInA = (float16_t const *) pSrcA->pData;   /* input data matrix pointer A */
    float16_t *pOut = pDst->pData;                              /* output data matrix pointer */
    float16_t *px;                                              /* Temporary output data matrix pointer */
    uint16_t numRowsA = pSrcA->numRows;                         /* number of rows of input matrix A */
    uint16_t numColsB = pSrcB->numCols;                         /* number of columns of input matrix B */
    uint16_t numColsA = pSrcA->numCols;                         /* number of columns of input matrix A */
    uint16_t col, i = 0U, row = numRowsA, colCnt;               /* loop counters */
    arm_status status;                                          /* status of matrix multiplication */
    uint16x8_t vecOffs, vecColBOffs;                            /* gather offsets into B (in f16 elements) */
    uint32_t blkCnt, rowCnt;                                    /* loop counters */

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition: A(m,p) * B(p,n) -> Dst(m,n) */
    if ((pSrcA->numCols != pSrcB->numRows) ||
        (pSrcA->numRows != pDst->numRows) ||
        (pSrcB->numCols != pDst->numCols)   )
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
    {
        /*
         * small squared matrix specialized routines
         */
        if (numRowsA == numColsB && numColsB == numColsA)
        {
            if (numRowsA == 1)
            {
                /* 1x1: single complex product (a+jb)(c+jd) = (ac-bd) + j(ad+bc) */
                pOut[0] = pInA[0] * pInB[0] - pInA[1] * pInB[1];
                pOut[1] = pInA[0] * pInB[1] + pInA[1] * pInB[0];
                return (ARM_MATH_SUCCESS);
            }
            else if (numRowsA == 2)
                return arm_mat_cmplx_mult_f16_2x2_mve(pSrcA, pSrcB, pDst);
            else if (numRowsA == 3)
                return arm_mat_cmplx_mult_f16_3x3_mve(pSrcA, pSrcB, pDst);
            else if (numRowsA == 4)
                return arm_mat_cmplx_mult_f16_4x4_mve(pSrcA, pSrcB, pDst);
        }

        /* Gather offsets selecting the real/imag pair of one B column in
         * 4 consecutive rows of B: lanes {0,1} row 0, {2,3} row 1, {4,5} row 2,
         * {6,7} row 3 (offsets are in f16 elements, CMPLX_DIM == 2).
         * NOTE(review): offsets are 16-bit and grow by numColsB*4*CMPLX_DIM per
         * block iteration — presumably large matrices could overflow uint16_t;
         * verify the supported dimension range. */
        vecColBOffs[0] = 0;
        vecColBOffs[1] = 1;
        vecColBOffs[2] = numColsB * CMPLX_DIM;
        vecColBOffs[3] = (numColsB * CMPLX_DIM) + 1;
        vecColBOffs[4] = 2*numColsB * CMPLX_DIM;
        vecColBOffs[5] = 2*(numColsB * CMPLX_DIM) + 1;
        vecColBOffs[6] = 3*numColsB * CMPLX_DIM;
        vecColBOffs[7] = 3*(numColsB * CMPLX_DIM) + 1;

        /*
         * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
         */
        /*
         * row loop: 4 rows of A per iteration
         */
        rowCnt = row >> 2;
        while (rowCnt > 0u)
        {
            /*
             * Output pointer is set to starting address of the row being processed
             */
            px = pOut + i * CMPLX_DIM;
            i = i + 4 * numColsB;       /* advance output row index by 4 rows */
            /*
             * For every row wise process, the column loop counter is to be initiated
             */
            col = numColsB;
            /*
             * For every row wise process, the pInB pointer is set
             * to the starting address of the pSrcB data
             */
            pInB = (float16_t const *) pSrcB->pData;
            /*
             * column loop
             */
            while (col > 0u)
            {
                /*
                 * generate 4 columns elements
                 */
                /*
                 * Matrix A columns number of MAC operations are to be performed
                 */
                colCnt = numColsA;      /* NOTE(review): assigned but not read in this path */

                float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
                float16_t const *pInA0 = pInA;
                float16_t const *pInA1 = pInA0 + numColsA * CMPLX_DIM;
                float16_t const *pInA2 = pInA1 + numColsA * CMPLX_DIM;
                float16_t const *pInA3 = pInA2 + numColsA * CMPLX_DIM;
                f16x8_t acc0, acc1, acc2, acc3;

                acc0 = vdupq_n_f16(0.0f16);
                acc1 = vdupq_n_f16(0.0f16);
                acc2 = vdupq_n_f16(0.0f16);
                acc3 = vdupq_n_f16(0.0f16);

                pSrcA0Vec = (float16_t const *) pInA0;
                pSrcA1Vec = (float16_t const *) pInA1;
                pSrcA2Vec = (float16_t const *) pInA2;
                pSrcA3Vec = (float16_t const *) pInA3;

                vecOffs = vecColBOffs;

                /*
                 * process 1 x 4 block output
                 */
                blkCnt = (numColsA * CMPLX_DIM) >> 3;
                while (blkCnt > 0U)
                {
                    f16x8_t vecB, vecA;

                    vecB = vldrhq_gather_shifted_offset_f16(pInB, vecOffs);
                    /*
                     * move Matrix B read offsets, 4 rows down
                     */
                    vecOffs = vaddq_n_u16(vecOffs , (uint16_t) (numColsB * 4 * CMPLX_DIM));

                    /* vcmlaq + vcmlaq_rot90 together accumulate the full
                     * complex product A*B into the accumulator lanes. */
                    vecA = vld1q(pSrcA0Vec);  pSrcA0Vec += 8;
                    acc0 = vcmlaq(acc0, vecA, vecB);
                    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
                    vecA = vld1q(pSrcA1Vec);  pSrcA1Vec += 8;
                    acc1 = vcmlaq(acc1, vecA, vecB);
                    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
                    vecA = vld1q(pSrcA2Vec);  pSrcA2Vec += 8;
                    acc2 = vcmlaq(acc2, vecA, vecB);
                    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
                    vecA = vld1q(pSrcA3Vec);  pSrcA3Vec += 8;
                    acc3 = vcmlaq(acc3, vecA, vecB);
                    acc3 = vcmlaq_rot90(acc3, vecA, vecB);

                    blkCnt--;
                }
                /*
                 * NOTE(review): original note said "Unsupported addressing mode
                 * compiler crash" — presumably a workaround for a toolchain
                 * issue; confirm whether it is still required.
                 */
                /*
                 * tail
                 * (will be merged thru tail predication)
                 */
                blkCnt = (numColsA * CMPLX_DIM) & 7;
                if (blkCnt > 0U)
                {
                    mve_pred16_t p0 = vctp16q(blkCnt);
                    f16x8_t vecB, vecA;

                    vecB = vldrhq_gather_shifted_offset_z_f16(pInB, vecOffs, p0);
                    /*
                     * move Matrix B read offsets, 4 rows down
                     */
                    vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));

                    vecA = vld1q(pSrcA0Vec);
                    acc0 = vcmlaq(acc0, vecA, vecB);
                    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
                    vecA = vld1q(pSrcA1Vec);
                    acc1 = vcmlaq(acc1, vecA, vecB);
                    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
                    vecA = vld1q(pSrcA2Vec);
                    acc2 = vcmlaq(acc2, vecA, vecB);
                    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
                    vecA = vld1q(pSrcA3Vec);
                    acc3 = vcmlaq(acc3, vecA, vecB);
                    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
                }

                /* Horizontal reduction of each accumulator into one complex
                 * output element, written to the 4 destination rows. */
                mve_cmplx_sum_intra_vec_f16(acc0, &px[0 * CMPLX_DIM * numColsB + 0]);
                mve_cmplx_sum_intra_vec_f16(acc1, &px[1 * CMPLX_DIM * numColsB + 0]);
                mve_cmplx_sum_intra_vec_f16(acc2, &px[2 * CMPLX_DIM * numColsB + 0]);
                mve_cmplx_sum_intra_vec_f16(acc3, &px[3 * CMPLX_DIM * numColsB + 0]);
                px += CMPLX_DIM;
                /*
                 * Decrement the column loop counter
                 */
                col--;
                /*
                 * Update the pointer pInB to point to the starting address of the next column
                 */
                pInB = (float16_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
            }
            /*
             * Update the pointer pInA to point to the starting address of the next row
             */
            pInA += (numColsA * 4) * CMPLX_DIM;
            /*
             * Decrement the row loop counter
             */
            rowCnt --;
        }

        /*
         * leftover rows (numRowsA not a multiple of 4): one row of A at a time
         */
        rowCnt = row & 3;
        while (rowCnt > 0u)
        {
            /*
             * Output pointer is set to starting address of the row being processed
             */
            px = pOut + i * CMPLX_DIM;
            i = i + numColsB;
            /*
             * For every row wise process, the column loop counter is to be initiated
             */
            col = numColsB;
            /*
             * For every row wise process, the pInB pointer is set
             * to the starting address of the pSrcB data
             */
            pInB = (float16_t const *) pSrcB->pData;
            /*
             * column loop
             */
            while (col > 0u)
            {
                /*
                 * generate 4 columns elements
                 */
                /*
                 * Matrix A columns number of MAC operations are to be performed
                 */
                colCnt = numColsA;      /* NOTE(review): assigned but not read in this path */

                float16_t const *pSrcA0Vec;
                float16_t const *pInA0 = pInA;
                f16x8_t acc0;

                acc0 = vdupq_n_f16(0.0f16);
                pSrcA0Vec = (float16_t const *) pInA0;

                vecOffs = vecColBOffs;
                /*
                 * process 1 x 4 block output
                 */
                blkCnt = (numColsA * CMPLX_DIM) >> 3;
                while (blkCnt > 0U)
                {
                    f16x8_t vecB, vecA;

                    vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
                    /*
                     * move Matrix B read offsets, 4 rows down
                     */
                    vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (4*numColsB * CMPLX_DIM));

                    vecA = vld1q(pSrcA0Vec);
                    pSrcA0Vec += 8;
                    acc0 = vcmlaq(acc0, vecA, vecB);
                    acc0 = vcmlaq_rot90(acc0, vecA, vecB);

                    blkCnt--;
                }
                /*
                 * tail (predicated)
                 */
                blkCnt = (numColsA * CMPLX_DIM) & 7;
                if (blkCnt > 0U)
                {
                    mve_pred16_t p0 = vctp16q(blkCnt);
                    f16x8_t vecB, vecA;

                    vecB = vldrhq_gather_shifted_offset_z(pInB, vecOffs, p0);
                    vecA = vld1q(pSrcA0Vec);
                    acc0 = vcmlaq(acc0, vecA, vecB);
                    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
                }

                mve_cmplx_sum_intra_vec_f16(acc0, &px[0]);
                px += CMPLX_DIM;
                /*
                 * Decrement the column loop counter
                 */
                col--;
                /*
                 * Update the pointer pInB to point to the starting address of the next column
                 */
                pInB = (float16_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
            }
            /*
             * Update the pointer pInA to point to the starting address of the next row
             */
            pInA += numColsA * CMPLX_DIM;
            rowCnt--;
        }

        /*
         * set status as ARM_MATH_SUCCESS
         */
        status = ARM_MATH_SUCCESS;
    }
    /*
     * Return to application
     */
    return (status);
}
#else
arm_status arm_mat_cmplx_mult_f16(
  const arm_matrix_instance_f16 * pSrcA,
  const arm_matrix_instance_f16 * pSrcB,
  arm_matrix_instance_f16 * pDst)
{
  /* Scalar fallback complex f16 matrix multiply: pDst = pSrcA * pSrcB.
   * Each output element is (a+jb)(c+jd) = (ac-bd) + j(bc+ad) accumulated
   * over one row of A and one column of B.
   * NOTE(review): accumulators are float16_t, so intermediate sums round
   * at half precision — confirm this matches the library's accuracy goals. */
  float16_t *pIn1 = pSrcA->pData;               /* Input data matrix pointer A */
  float16_t *pIn2 = pSrcB->pData;               /* Input data matrix pointer B */
  float16_t *pInA = pSrcA->pData;               /* Input data matrix pointer A */
  float16_t *pOut = pDst->pData;                /* Output data matrix pointer */
  float16_t *px;                                /* Temporary output data matrix pointer */
  uint16_t numRowsA = pSrcA->numRows;           /* Number of rows of input matrix A */
  uint16_t numColsB = pSrcB->numCols;           /* Number of columns of input matrix B */
  uint16_t numColsA = pSrcA->numCols;           /* Number of columns of input matrix A */
  float16_t sumReal, sumImag;                   /* Accumulator */
  float16_t a1, b1, c1, d1;
  uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
  arm_status status;                            /* status of matrix multiplication */

#if defined (ARM_MATH_LOOPUNROLL)
  float16_t a0, b0, c0, d0;                     /* second register set for 4-way unrolling */
#endif

#ifdef ARM_MATH_MATRIX_CHECK
  /* Check for matrix mismatch condition: A(m,p) * B(p,n) -> Dst(m,n) */
  if ((pSrcA->numCols != pSrcB->numRows) ||
      (pSrcA->numRows != pDst->numRows)  ||
      (pSrcB->numCols != pDst->numCols)    )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
    /* row loop */
    do
    {
      /* Output pointer is set to starting address of the row being processed
         (i counts complex elements, hence the factor 2) */
      px = pOut + 2 * i;

      /* For every row wise process, the column loop counter is to be initiated */
      col = numColsB;

      /* For every row wise process, the pIn2 pointer is set
      ** to the starting address of the pSrcB data */
      pIn2 = pSrcB->pData;

      j = 0U;

      /* column loop */
      do
      {
        /* Set the variable sum, that acts as accumulator, to zero */
        sumReal = 0.0f;
        sumImag = 0.0f;

        /* Initiate pointer pIn1 to point to starting address of column being processed */
        pIn1 = pInA;

#if defined (ARM_MATH_LOOPUNROLL)

        /* Apply loop unrolling and compute 4 MACs simultaneously. */
        colCnt = numColsA >> 2U;

        /* matrix multiplication */
        while (colCnt > 0U)
        {
          /* Reading real part of complex matrix A */
          a0 = *pIn1;

          /* Reading real part of complex matrix B */
          c0 = *pIn2;

          /* Reading imaginary part of complex matrix A */
          b0 = *(pIn1 + 1U);

          /* Reading imaginary part of complex matrix B */
          d0 = *(pIn2 + 1U);

          /* Multiply and Accumlates */
          sumReal += a0 * c0;
          sumImag += b0 * c0;

          /* update pointers: A walks one complex element, B walks one row */
          pIn1 += 2U;
          pIn2 += 2 * numColsB;

          /* Multiply and Accumlates */
          sumReal -= b0 * d0;
          sumImag += a0 * d0;

          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */

          /* read real and imag values from pSrcA and pSrcB buffer */
          a1 = *(pIn1     );
          c1 = *(pIn2     );
          b1 = *(pIn1 + 1U);
          d1 = *(pIn2 + 1U);

          /* Multiply and Accumlates */
          sumReal += a1 * c1;
          sumImag += b1 * c1;

          /* update pointers */
          pIn1 += 2U;
          pIn2 += 2 * numColsB;

          /* Multiply and Accumlates */
          sumReal -= b1 * d1;
          sumImag += a1 * d1;

          a0 = *(pIn1     );
          c0 = *(pIn2     );
          b0 = *(pIn1 + 1U);
          d0 = *(pIn2 + 1U);

          /* Multiply and Accumlates */
          sumReal += a0 * c0;
          sumImag += b0 * c0;

          /* update pointers */
          pIn1 += 2U;
          pIn2 += 2 * numColsB;

          /* Multiply and Accumlates */
          sumReal -= b0 * d0;
          sumImag += a0 * d0;

          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */

          a1 = *(pIn1     );
          c1 = *(pIn2     );
          b1 = *(pIn1 + 1U);
          d1 = *(pIn2 + 1U);

          /* Multiply and Accumlates */
          sumReal += a1 * c1;
          sumImag += b1 * c1;

          /* update pointers */
          pIn1 += 2U;
          pIn2 += 2 * numColsB;

          /* Multiply and Accumlates */
          sumReal -= b1 * d1;
          sumImag += a1 * d1;

          /* Decrement loop count */
          colCnt--;
        }

        /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
        ** No loop unrolling is used. */
        colCnt = numColsA % 0x4U;

#else

        /* Initialize blkCnt with number of samples */
        colCnt = numColsA;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

        while (colCnt > 0U)
        {
          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */

          a1 = *(pIn1     );
          c1 = *(pIn2     );
          b1 = *(pIn1 + 1U);
          d1 = *(pIn2 + 1U);

          /* Multiply and Accumlates */
          sumReal += a1 * c1;
          sumImag += b1 * c1;

          /* update pointers */
          pIn1 += 2U;
          pIn2 += 2 * numColsB;

          /* Multiply and Accumlates */
          sumReal -= b1 * d1;
          sumImag += a1 * d1;

          /* Decrement loop counter */
          colCnt--;
        }

        /* Store result in destination buffer */
        *px++ = sumReal;
        *px++ = sumImag;

        /* Update pointer pIn2 to point to starting address of next column */
        j++;
        pIn2 = pSrcB->pData + 2U * j;

        /* Decrement column loop counter */
        col--;

      } while (col > 0U);

      /* Update pointer pInA to point to starting address of next row */
      i = i + numColsB;
      pInA = pInA + 2 * numColsA;

      /* Decrement row loop counter */
      row--;

    } while (row > 0U);

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixMult group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

@ -0,0 +1,131 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_cmplx_trans_f16.c
* Description: Floating-point complex matrix transpose
*
* $Date: 08. July 2020
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixComplexTrans
@{
*/
/**
  @brief Floating-point complex matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_cmplx_trans_f16(const arm_matrix_instance_f16 * pSrc, arm_matrix_instance_f16 * pDst)
{
    /* Thin MVE wrapper: both real and imaginary f16 halves are plain 16-bit
       words, so the generic 16-bit complex transpose helper does all the work
       (it also performs the dimension consistency check). */
    arm_status status;

    status = arm_mat_cmplx_trans_16bit(pSrc->numRows, pSrc->numCols,
                                       (uint16_t *) pSrc->pData,
                                       pDst->numRows, pDst->numCols,
                                       (uint16_t *) pDst->pData);
    return (status);
}
#else
arm_status arm_mat_cmplx_trans_f16(
  const arm_matrix_instance_f16 * pSrc,
  arm_matrix_instance_f16 * pDst)
{
  /* Scalar complex f16 matrix transpose: pDst[c][r] = pSrc[r][c] where each
   * element is an interleaved (real, imag) pair.
   *
   * Fix vs. original: the do/while loops with unsigned counters executed the
   * body at least once and wrapped (row-- from 0 -> 0xFFFF) for a
   * zero-dimension matrix, walking far out of bounds.  Guarded for-loops make
   * empty matrices a harmless no-op while behaving identically for all valid
   * (non-empty) inputs. */
  float16_t *pIn = pSrc->pData;         /* input data matrix pointer (row-major walk) */
  float16_t *pOut = pDst->pData;        /* output data matrix pointer */
  uint16_t nRows = pSrc->numRows;       /* number of rows */
  uint16_t nColumns = pSrc->numCols;    /* number of columns */
  arm_status status;                    /* status of matrix transpose */

#ifdef ARM_MATH_MATRIX_CHECK
  /* Check for matrix mismatch condition: transpose swaps the dimensions */
  if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    uint16_t rowIdx, colIdx;

    /* Matrix transpose by exchanging the rows with columns */
    for (rowIdx = 0U; rowIdx < nRows; rowIdx++)
    {
      /* Row r of the source becomes column r of the destination */
      float16_t *px = pOut + CMPLX_DIM * rowIdx;

      for (colIdx = 0U; colIdx < nColumns; colIdx++)
      {
        /* Copy one complex element (real, imag) */
        px[0] = *pIn++;                 // real
        px[1] = *pIn++;                 // imag

        /* Jump to the next row of the transposed matrix */
        px += CMPLX_DIM * nRows;
      }
    }

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
 * @} end of MatrixComplexTrans group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

@ -0,0 +1,74 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_init_f16.c
* Description: Floating-point matrix initialization
*
* $Date: 18. March 2020
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInit
@{
*/
/**
@brief Floating-point matrix initialization.
@param[in,out] S points to an instance of the floating-point matrix structure
@param[in] nRows number of rows in the matrix
@param[in] nColumns number of columns in the matrix
@param[in] pData points to the matrix data array
@return none
*/
void arm_mat_init_f16(
  arm_matrix_instance_f16 * S,
  uint16_t nRows,
  uint16_t nColumns,
  float16_t * pData)
{
  /* Fill in the matrix descriptor.  No allocation or copy is performed:
     the instance simply records the dimensions and aliases the caller's
     data buffer, which must therefore outlive the instance. */
  S->pData   = pData;       /* backing storage (row-major, nRows * nColumns) */
  S->numCols = nColumns;    /* column count */
  S->numRows = nRows;       /* row count */
}
/**
@} end of MatrixInit group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

@ -0,0 +1,904 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_inverse_f16.c
* Description: Floating-point matrix inverse
*
* $Date: 18. March 2020
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
@brief Floating-point matrix inverse.
@param[in] pSrc points to input matrix structure. The source matrix is modified by the function.
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_SINGULAR : Input matrix is found to be singular (non-invertible)
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_inverse_f16(
  const arm_matrix_instance_f16 * pSrc,
  arm_matrix_instance_f16 * pDst)
{
    /* MVE (Helium) Gauss-Jordan inverse.  pSrc is reduced in place to the
     * identity while the same row operations applied to pDst (initialized to
     * the identity) produce the inverse.
     *
     * Fix vs. original: removed a stale copy-paste pointer bump
     * (`pInT2 += blkCnt;`) from the destination-row elimination tail; pInT2
     * is only meaningful inside the pivot-exchange branch and is not read
     * after that point. */
    float16_t *pIn = pSrc->pData;       /* input data matrix pointer */
    float16_t *pOut = pDst->pData;      /* output data matrix pointer */
    float16_t *pInT1, *pInT2;           /* Temporary input data matrix pointer */
    float16_t *pOutT1, *pOutT2;         /* Temporary output data matrix pointer */
    float16_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;  /* Temporary input and output data matrix pointer */

    uint32_t  numRows = pSrc->numRows;  /* Number of rows in the matrix */
    uint32_t  numCols = pSrc->numCols;  /* Number of Cols in the matrix */

    float16_t *pTmpA, *pTmpB;

    float16_t in = 0.0f;                /* Temporary input values */
    uint32_t  i, rowCnt, flag = 0U, j, loopCnt, k, l;  /* loop counters */
    arm_status status;                  /* status of matrix inverse */
    uint32_t  blkCnt;

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition: inverse needs square matrices of equal size */
    if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
        || (pSrc->numRows != pDst->numRows))
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
    {
        /*--------------------------------------------------------------------------------------------------------------
         * Matrix Inverse can be solved using elementary row operations.
         *
         *  Gauss-Jordan Method:
         *
         *     1. First combine the identity matrix and the input matrix separated by a bar to form an
         *        augmented matrix as follows:
         *                      _  _          _     _      _   _         _         _
         *                     |  |  a11  a12  | | | 1   0  |   |       |  X11 X12  |
         *                     |  |            | | |        |   |   =   |           |
         *                     |_ |_ a21  a22 _| | |_0   1 _|  _|       |_ X21 X22 _|
         *
         *     2. In our implementation, pDst Matrix is used as identity matrix.
         *
         *     3. Begin with the first row. Let i = 1.
         *
         *     4. Check to see if the pivot for row i is zero.
         *        The pivot is the element of the main diagonal that is on the current row.
         *        For instance, if working with row i, then the pivot element is aii.
         *        If the pivot is zero, exchange that row with a row below it that does not
         *        contain a zero in column i. If this is not possible, then an inverse
         *        to that matrix does not exist.
         *
         *     5. Divide every element of row i by the pivot.
         *
         *     6. For every row below and  row i, replace that row with the sum of that row and
         *        a multiple of row i so that each new element in column i below row i is zero.
         *
         *     7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
         *        for every element below and above the main diagonal.
         *
         *     8. Now an identical matrix is formed to the left of the bar(input matrix, src).
         *        Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
         *----------------------------------------------------------------------------------------------------------------*/

        /*
         * Working pointer for destination matrix
         */
        pOutT1 = pOut;
        /*
         * Loop over the number of rows
         */
        rowCnt = numRows;
        /*
         * Making the destination matrix as identity matrix
         */
        while (rowCnt > 0U)
        {
            /*
             * Writing all zeroes in lower triangle of the destination matrix
             */
            j = numRows - rowCnt;
            while (j > 0U)
            {
                *pOutT1++ = 0.0f;
                j--;
            }
            /*
             * Writing all ones in the diagonal of the destination matrix
             */
            *pOutT1++ = 1.0f;
            /*
             * Writing all zeroes in upper triangle of the destination matrix
             */
            j = rowCnt - 1U;
            while (j > 0U)
            {
                *pOutT1++ = 0.0f;
                j--;
            }
            /*
             * Decrement the loop counter
             */
            rowCnt--;
        }

        /*
         * Loop over the number of columns of the input matrix.
         * All the elements in each column are processed by the row operations
         */
        loopCnt = numCols;
        /*
         * Index modifier to navigate through the columns
         */
        l = 0U;
        while (loopCnt > 0U)
        {
            /*
             * Check if the pivot element is zero..
             * If it is zero then interchange the row with non zero row below.
             * If there is no non zero element to replace in the rows below,
             * then the matrix is Singular.
             */

            /*
             * Working pointer for the input matrix that points
             * to the pivot element of the particular row
             */
            pInT1 = pIn + (l * numCols);
            /*
             * Working pointer for the destination matrix that points
             * to the pivot element of the particular row
             */
            pOutT1 = pOut + (l * numCols);
            /*
             * Temporary variable to hold the pivot value
             */
            in = *pInT1;
            /*
             * Destination pointer modifier
             */
            k = 1U;

            /*
             * Check if the pivot element is zero
             */
            if (*pInT1 == 0.0f)
            {
                /*
                 * Loop over the number rows present below
                 */
                for (i = (l + 1U); i < numRows; i++)
                {
                    /*
                     * Update the input and destination pointers
                     */
                    pInT2 = pInT1 + (numCols * i);
                    pOutT2 = pOutT1 + (numCols * k);
                    /*
                     * Check if there is a non zero pivot element to
                     * replace in the rows below
                     */
                    if (*pInT2 != 0.0f)
                    {
                        f16x8_t vecA, vecB;
                        /*
                         * Swap the input rows (columns l..numCols-1 only:
                         * everything left of l is already zero)
                         */
                        pTmpA = pInT1;
                        pTmpB = pInT2;
                        blkCnt = (numCols - l) >> 3;
                        while (blkCnt > 0U)
                        {
                            vecA = vldrhq_f16(pTmpA);
                            vecB = vldrhq_f16(pTmpB);
                            vstrhq_f16(pTmpB, vecA);
                            vstrhq_f16(pTmpA, vecB);
                            pTmpA += 8;
                            pTmpB += 8;
                            /*
                             * Decrement the blockSize loop counter
                             */
                            blkCnt--;
                        }
                        /*
                         * tail
                         * (will be merged thru tail predication)
                         */
                        blkCnt = (numCols - l) & 7;
                        if (blkCnt > 0U)
                        {
                            mve_pred16_t p0 = vctp16q(blkCnt);

                            vecA = vldrhq_f16(pTmpA);
                            vecB = vldrhq_f16(pTmpB);
                            vstrhq_p_f16(pTmpB, vecA, p0);
                            vstrhq_p_f16(pTmpA, vecB, p0);
                        }

                        pInT1 += numCols - l;
                        pInT2 += numCols - l;

                        /*
                         * Swap the corresponding destination rows (full width)
                         */
                        pTmpA = pOutT1;
                        pTmpB = pOutT2;
                        blkCnt = numCols >> 3;
                        while (blkCnt > 0U)
                        {
                            vecA = vldrhq_f16(pTmpA);
                            vecB = vldrhq_f16(pTmpB);
                            vstrhq_f16(pTmpB, vecA);
                            vstrhq_f16(pTmpA, vecB);
                            pTmpA += 8;
                            pTmpB += 8;
                            /*
                             * Decrement the blockSize loop counter
                             */
                            blkCnt--;
                        }
                        /*
                         * tail
                         */
                        blkCnt = numCols & 7;
                        if (blkCnt > 0U)
                        {
                            mve_pred16_t p0 = vctp16q(blkCnt);

                            vecA = vldrhq_f16(pTmpA);
                            vecB = vldrhq_f16(pTmpB);
                            vstrhq_p_f16(pTmpB, vecA, p0);
                            vstrhq_p_f16(pTmpA, vecB, p0);
                        }

                        pOutT1 += numCols;
                        pOutT2 += numCols;
                        /*
                         * Flag to indicate whether exchange is done or not
                         */
                        flag = 1U;
                        /*
                         * Break after exchange is done
                         */
                        break;
                    }
                    /*
                     * Update the destination pointer modifier
                     */
                    k++;
                }
            }

            /*
             * Update the status if the matrix is singular
             */
            if ((flag != 1U) && (in == 0.0f))
            {
                return ARM_MATH_SINGULAR;
            }

            /*
             * Points to the pivot row of input and destination matrices
             */
            pPivotRowIn = pIn + (l * numCols);
            pPivotRowDst = pOut + (l * numCols);
            /*
             * Temporary pointers to the pivot row pointers
             */
            pInT1 = pPivotRowIn;
            pOutT1 = pPivotRowDst;
            /*
             * Pivot element of the row
             */
            in = *(pIn + (l * numCols));

            /*
             * Divide the pivot row by the pivot: multiply by 1/in splatted
             * across a vector (input row: columns l..numCols-1 only)
             */
            pTmpA = pInT1;

            f16x8_t invIn = vdupq_n_f16(1.0f / in);

            blkCnt = (numCols - l) >> 3;
            f16x8_t vecA;
            while (blkCnt > 0U)
            {
                *(f16x8_t *) pTmpA = *(f16x8_t *) pTmpA * invIn;
                pTmpA += 8;
                /*
                 * Decrement the blockSize loop counter
                 */
                blkCnt--;
            }
            /*
             * tail
             */
            blkCnt = (numCols - l) & 7;
            if (blkCnt > 0U)
            {
                mve_pred16_t p0 = vctp16q(blkCnt);

                vecA = vldrhq_f16(pTmpA);
                vecA = vecA * invIn;
                vstrhq_p_f16(pTmpA, vecA, p0);
            }

            pInT1 += numCols - l;

            /*
             * Scale the destination pivot row (full width)
             */
            pTmpA = pOutT1;
            blkCnt = numCols >> 3;
            while (blkCnt > 0U)
            {
                *(f16x8_t *) pTmpA = *(f16x8_t *) pTmpA * invIn;
                pTmpA += 8;
                /*
                 * Decrement the blockSize loop counter
                 */
                blkCnt--;
            }
            /*
             * tail
             * (will be merged thru tail predication)
             */
            blkCnt = numCols & 7;
            if (blkCnt > 0U)
            {
                mve_pred16_t p0 = vctp16q(blkCnt);

                vecA = vldrhq_f16(pTmpA);
                vecA = vecA * invIn;
                vstrhq_p_f16(pTmpA, vecA, p0);
            }

            pOutT1 += numCols;

            /*
             * Replace the rows with the sum of that row and a multiple of row i
             * so that each new element in column i above row i is zero.
             */

            /*
             * Temporary pointers for input and destination matrices
             */
            pInT1 = pIn;
            pOutT1 = pOut;

            for (i = 0U; i < numRows; i++)
            {
                /*
                 * Check for the pivot element
                 */
                if (i == l)
                {
                    /*
                     * If the processing element is the pivot element,
                     * only the columns to the right are to be processed
                     */
                    pInT1 += numCols - l;
                    pOutT1 += numCols;
                }
                else
                {
                    /*
                     * Element of the reference row
                     */

                    /*
                     * Working pointers for input and destination pivot rows
                     */
                    pPRT_in = pPivotRowIn;
                    pPRT_pDst = pPivotRowDst;
                    /*
                     * Loop over the number of columns to the right of the pivot element,
                     * to replace the elements in the input matrix
                     */
                    in = *pInT1;
                    f16x8_t tmpV = vdupq_n_f16(in);

                    blkCnt = (numCols - l) >> 3;
                    while (blkCnt > 0U)
                    {
                        f16x8_t vec1, vec2;
                        /*
                         * Replace the element by the sum of that row
                         * and a multiple of the reference row: row -= in * pivotRow
                         */
                        vec1 = vldrhq_f16(pInT1);
                        vec2 = vldrhq_f16(pPRT_in);
                        vec1 = vfmsq_f16(vec1, tmpV, vec2);
                        vstrhq_f16(pInT1, vec1);
                        pPRT_in += 8;
                        pInT1 += 8;
                        /*
                         * Decrement the blockSize loop counter
                         */
                        blkCnt--;
                    }
                    /*
                     * tail
                     * (will be merged thru tail predication)
                     */
                    blkCnt = (numCols - l) & 7;
                    if (blkCnt > 0U)
                    {
                        f16x8_t vec1, vec2;
                        mve_pred16_t p0 = vctp16q(blkCnt);

                        vec1 = vldrhq_f16(pInT1);
                        vec2 = vldrhq_f16(pPRT_in);
                        vec1 = vfmsq_f16(vec1, tmpV, vec2);
                        vstrhq_p_f16(pInT1, vec1, p0);
                        pInT1 += blkCnt;
                    }

                    /*
                     * Same row operation on the destination matrix (full width)
                     */
                    blkCnt = numCols >> 3;
                    while (blkCnt > 0U)
                    {
                        f16x8_t vec1, vec2;
                        /*
                         * Replace the element by the sum of that row
                         * and a multiple of the reference row
                         */
                        vec1 = vldrhq_f16(pOutT1);
                        vec2 = vldrhq_f16(pPRT_pDst);
                        vec1 = vfmsq_f16(vec1, tmpV, vec2);
                        vstrhq_f16(pOutT1, vec1);
                        pPRT_pDst += 8;
                        pOutT1 += 8;
                        /*
                         * Decrement the blockSize loop counter
                         */
                        blkCnt--;
                    }
                    /*
                     * tail
                     * (will be merged thru tail predication)
                     */
                    blkCnt = numCols & 7;
                    if (blkCnt > 0U)
                    {
                        f16x8_t vec1, vec2;
                        mve_pred16_t p0 = vctp16q(blkCnt);

                        vec1 = vldrhq_f16(pOutT1);
                        vec2 = vldrhq_f16(pPRT_pDst);
                        vec1 = vfmsq_f16(vec1, tmpV, vec2);
                        vstrhq_p_f16(pOutT1, vec1, p0);
                        /* (removed stale `pInT2 += blkCnt;` — pInT2 is not used here) */
                        pOutT1 += blkCnt;
                    }
                }
                /*
                 * Increment the temporary input pointer past the skipped
                 * left part of the row (columns 0..l-1)
                 */
                pInT1 = pInT1 + l;
            }
            /*
             * Increment the input pointer
             */
            pIn++;
            /*
             * Decrement the loop counter
             */
            loopCnt--;
            /*
             * Increment the index modifier
             */
            l++;
        }

        /*
         * Set status as ARM_MATH_SUCCESS
         */
        status = ARM_MATH_SUCCESS;

        /*
         * Last-chance singularity check: if no exchange ever happened and the
         * final pivot was zero, declare singular only when the (now reduced)
         * source matrix is entirely zero.
         */
        if ((flag != 1U) && (in == 0.0f))
        {
            pIn = pSrc->pData;
            for (i = 0; i < numRows * numCols; i++)
            {
                if (pIn[i] != 0.0f)
                    break;
            }

            if (i == numRows * numCols)
                status = ARM_MATH_SINGULAR;
        }
    }
    /* Return to application */
    return (status);
}
#else
/**
  @brief         Floating-point (f16) matrix inverse — scalar (non-MVE) fallback,
                 Gauss-Jordan elimination with partial row exchange.
  @param[in]     pSrc  points to the input square matrix.
                 NOTE: the input data buffer is modified in place — it is reduced
                 towards the identity matrix as the elimination proceeds (rows are
                 exchanged, scaled and combined below).
  @param[out]    pDst  points to the output matrix; receives the inverse.
  @return        ARM_MATH_SUCCESS on success, ARM_MATH_SINGULAR when no inverse
                 exists, or (when ARM_MATH_MATRIX_CHECK is defined)
                 ARM_MATH_SIZE_MISMATCH for non-square / mismatched sizes.
 */
arm_status arm_mat_inverse_f16(
const arm_matrix_instance_f16 * pSrc,
arm_matrix_instance_f16 * pDst)
{
float16_t *pIn = pSrc->pData; /* input data matrix pointer */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
float16_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
float16_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
float16_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
float16_t Xchg, in = 0.0f, in1; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
 * Matrix Inverse can be solved using elementary row operations.
 *
 *  Gauss-Jordan Method:
 *
 *      1. First combine the identity matrix and the input matrix separated by a bar to form an
 *        augmented matrix as follows:
 *                      _                  _         _         _
 *                     |  a11  a12 | 1   0  |       |  X11 X12  |
 *                     |           |        |   =   |           |
 *                     |_ a21  a22 | 0   1 _|       |_ X21 X21 _|
 *
 *      2. In our implementation, pDst Matrix is used as identity matrix.
 *
 *      3. Begin with the first row. Let i = 1.
 *
 *      4. Check to see if the pivot for row i is zero.
 *         The pivot is the element of the main diagonal that is on the current row.
 *         For instance, if working with row i, then the pivot element is aii.
 *         If the pivot is zero, exchange that row with a row below it that does not
 *         contain a zero in column i. If this is not possible, then an inverse
 *         to that matrix does not exist.
 *
 *      5. Divide every element of row i by the pivot.
 *
 *      6. For every row below and  row i, replace that row with the sum of that row and
 *         a multiple of row i so that each new element in column i below row i is zero.
 *
 *      7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
 *         for every element below and above the main diagonal.
 *
 *      8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
 *         Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
 *----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
/* Loop over the number of rows */
rowCnt = numRows;
/* Making the destination matrix as identity matrix */
while (rowCnt > 0U)
{
/* Writing all zeroes in lower triangle of the destination matrix */
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0f;
j--;
}
/* Writing all ones in the diagonal of the destination matrix */
*pOutT1++ = 1.0f;
/* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0f;
j--;
}
/* Decrement loop counter */
rowCnt--;
}
/* Loop over the number of columns of the input matrix.
   All the elements in each column are processed by the row operations */
loopCnt = numCols;
/* Index modifier to navigate through the columns */
l = 0U;
while (loopCnt > 0U)
{
/* Check if the pivot element is zero..
 * If it is zero then interchange the row with non zero row below.
 * If there is no non zero element to replace in the rows below,
 * then the matrix is Singular. */
/* Working pointer for the input matrix that points
 * to the pivot element of the particular row */
pInT1 = pIn + (l * numCols);
/* Working pointer for the destination matrix that points
 * to the pivot element of the particular row */
pOutT1 = pOut + (l * numCols);
/* Temporary variable to hold the pivot value */
in = *pInT1;
/* Destination pointer modifier */
k = 1U;
/* Check if the pivot element is zero.
   NOTE(review): exact f16 equality is the intended pivoting test here; there
   is no epsilon tolerance, matching the f32 CMSIS implementation. */
if (*pInT1 == 0.0f)
{
/* Loop over the number rows present below */
for (i = (l + 1U); i < numRows; i++)
{
/* Update the input and destination pointers */
pInT2 = pInT1 + (numCols * i);
pOutT2 = pOutT1 + (numCols * k);
/* Check if there is a non zero pivot element to
 * replace in the rows below */
if (*pInT2 != 0.0f)
{
/* Loop over number of columns
 * to the right of the pilot element */
j = numCols - l;
while (j > 0U)
{
/* Exchange the row elements of the input matrix */
Xchg = *pInT2;
*pInT2++ = *pInT1;
*pInT1++ = Xchg;
/* Decrement the loop counter */
j--;
}
/* Loop over number of columns of the destination matrix */
j = numCols;
while (j > 0U)
{
/* Exchange the row elements of the destination matrix */
Xchg = *pOutT2;
*pOutT2++ = *pOutT1;
*pOutT1++ = Xchg;
/* Decrement loop counter */
j--;
}
/* Flag to indicate whether exchange is done or not */
flag = 1U;
/* Break after exchange is done */
break;
}
/* Update the destination pointer modifier */
k++;
/* Move on to the next candidate row (loop increment is in the for header) */
}
}
/* Update the status if the matrix is singular */
if ((flag != 1U) && (in == 0.0f))
{
return ARM_MATH_SINGULAR;
}
/* Points to the pivot row of input and destination matrices */
pPivotRowIn = pIn + (l * numCols);
pPivotRowDst = pOut + (l * numCols);
/* Temporary pointers to the pivot row pointers */
pInT1 = pPivotRowIn;
pInT2 = pPivotRowDst;
/* Pivot element of the row */
in = *pPivotRowIn;
/* Loop over number of columns
 * to the right of the pilot element */
j = (numCols - l);
while (j > 0U)
{
/* Divide each element of the row of the input matrix
 * by the pivot element */
in1 = *pInT1;
*pInT1++ = in1 / in;
/* Decrement the loop counter */
j--;
}
/* Loop over number of columns of the destination matrix */
j = numCols;
while (j > 0U)
{
/* Divide each element of the row of the destination matrix
 * by the pivot element */
in1 = *pInT2;
*pInT2++ = in1 / in;
/* Decrement the loop counter */
j--;
}
/* Replace the rows with the sum of that row and a multiple of row i
 * so that each new element in column i above row i is zero.*/
/* Temporary pointers for input and destination matrices */
pInT1 = pIn;
pInT2 = pOut;
/* index used to check for pivot element */
i = 0U;
/* Loop over number of rows */
/*  to be replaced by the sum of that row and a multiple of row i */
k = numRows;
while (k > 0U)
{
/* Check for the pivot element */
if (i == l)
{
/* If the processing element is the pivot element,
   only the columns to the right are to be processed */
pInT1 += numCols - l;
pInT2 += numCols;
}
else
{
/* Element of the reference row */
in = *pInT1;
/* Working pointers for input and destination pivot rows */
pPRT_in = pPivotRowIn;
pPRT_pDst = pPivotRowDst;
/* Loop over the number of columns to the right of the pivot element,
   to replace the elements in the input matrix */
j = (numCols - l);
while (j > 0U)
{
/* Replace the element by the sum of that row
   and a multiple of the reference row  */
in1 = *pInT1;
*pInT1++ = in1 - (in * *pPRT_in++);
/* Decrement the loop counter */
j--;
}
/* Loop over the number of columns to
   replace the elements in the destination matrix */
j = numCols;
while (j > 0U)
{
/* Replace the element by the sum of that row
   and a multiple of the reference row  */
in1 = *pInT2;
*pInT2++ = in1 - (in * *pPRT_pDst++);
/* Decrement loop counter */
j--;
}
}
/* Increment temporary input pointer */
pInT1 = pInT1 + l;
/* Decrement loop counter */
k--;
/* Increment pivot index */
i++;
}
/* Increment the input pointer */
pIn++;
/* Decrement the loop counter */
loopCnt--;
/* Increment the index modifier */
l++;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
/* Late singular detection: if no row exchange ever occurred and the last
   pivot read was zero, scan the (now reduced) input buffer; an all-zero
   buffer means the matrix had no inverse. */
if ((flag != 1U) && (in == 0.0f))
{
pIn = pSrc->pData;
for (i = 0; i < numRows * numCols; i++)
{
if (pIn[i] != 0.0f)
break;
}
if (i == numRows * numCols)
status = ARM_MATH_SINGULAR;
}
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixInv group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

/* ===== file boundary: arm_mat_mult_f16.c (new file) ===== */
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_f16.c
* Description: Floating-point matrix multiplication
*
* $Date: 18. March 2020
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
* @ingroup groupMatrix
*/
/**
* @addtogroup MatrixMult
* @{
*/
/**
* @brief Floating-point matrix multiplication.
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
/**
  @brief  2x2 f16 matrix multiply specialization using MVE gather loads.
          Both rows of the product are computed in the low 4 lanes of one
          vector; the upper 4 lanes are don't-care and masked off at store.
  @param[in]  pSrcA  first input matrix (2x2)
  @param[in]  pSrcB  second input matrix (2x2)
  @param[out] pDst   output matrix (2x2)
  @return     ARM_MATH_SUCCESS (no size checking is done here; the caller
              arm_mat_mult_f16 dispatches only square matching sizes)
 */
__STATIC_FORCEINLINE arm_status arm_mat_mult_f16_2x2_mve(
const arm_matrix_instance_f16 *pSrcA,
const arm_matrix_instance_f16 *pSrcB,
arm_matrix_instance_f16 *pDst)
{
/* offsetA duplicates each element of one column of A across a row of products */
static const uint16_t offsetA[8] = { 0, 0, 2, 2, 0, 0, 2, 2 };
/* offsetB allows to read and duplicate 1 row of B */
static const uint16_t offsetB[8] = { 0, 1, 0, 1, 0, 1, 0, 1 };
uint16x8_t vecOffsA, vecOffsB;
f16x8_t vecInA, vecInB, vecDst;
float16_t *pOut = pDst->pData; /* output data matrix pointer */
/*
 * load initial offsets
 */
vecOffsA = vldrhq_u16((uint16_t const *) offsetA);
vecOffsB = vldrhq_u16((uint16_t const *) offsetB);
/*
 * load {a00 a00 a10 a10 x x x x }
 */
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
 * load {b00 b01 b00 b01 x x x x }
 */
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
/*
 * { a00 b00       a00 b01
 *   a10 b00       a10 b01
 *       x             x
 *       x             x   }
 */
vecDst = vmulq(vecInA, vecInB);
/*
 * move to 2nd column of matrix A
 */
vecOffsA = vaddq(vecOffsA, (uint16_t) 1);
/*
 * load {a01 a01 a11 a11 x x x x}
 */
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
 * move to next B row
 */
vecOffsB = vaddq(vecOffsB, (uint16_t) 2);
/*
 * load {b10, b11, b10, b11, x x x x }
 */
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
/*
 * { a00 b00 + a01 b10   a00 b01 + a01 b11
 *   a10 b00 + a11 b10   a10 b01 + a11 b11
 *          x                    x
 *          x                    x       }
 */
vecDst = vfmaq(vecDst, vecInA, vecInB);
/* predicate covering the 4 valid result lanes (2x2 = 4 elements) */
mve_pred16_t p0 = vctp16q(2*2);
/*
 * Store the result in the destination buffer
 * (lower half of the vector)
 */
vstrhq_p(pOut, vecDst, p0);
return (ARM_MATH_SUCCESS);
}
/**
  @brief  3x3 f16 matrix multiply specialization using MVE gather loads.
          Eight of the nine results are produced vectorized; the last
          element (c22) is finished in scalar code below.
  @param[in]  pSrcA  first input matrix (3x3)
  @param[in]  pSrcB  second input matrix (3x3)
  @param[out] pDst   output matrix (3x3)
  @return     ARM_MATH_SUCCESS (no size checking; caller dispatches)
 */
__STATIC_FORCEINLINE arm_status arm_mat_mult_f16_3x3_mve(
const arm_matrix_instance_f16 *pSrcA,
const arm_matrix_instance_f16 *pSrcB,
arm_matrix_instance_f16 *pDst)
{
/* offsetA duplicates each element of one column of A three times (row-major, stride 3) */
static const uint16_t offsetA[8] = { 0, 0, 0, 3, 3, 3, 6, 6 };
/* offsetB allows to read and duplicate 1 row of B */
static const uint16_t offsetB[8] = { 0, 1, 2, 0, 1, 2, 0, 1 };
uint16x8_t vecOffsA, vecOffsB;
f16x8_t vecInA, vecInB, vecDst;
float16_t *pOut = pDst->pData; /* output data matrix pointer */
/*
 * load initial offsets
 */
vecOffsA = vldrhq_u16((uint16_t const *) offsetA);
vecOffsB = vldrhq_u16((uint16_t const *) offsetB);
/*
 * load {a00 a00 a00 a10 a10 a10 a20 a20}
 */
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
 * load {b00 b01 b02 b00 b01 b02 b00 b01}
 */
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
/*
 * { a00 b00       a00 b01     a00 b02
 *   a10 b00       a10 b01     a10 b02
 *   a20 b00       a20 b01}
 */
vecDst = vmulq(vecInA, vecInB);
/*
 * move to 2nd column of matrix A
 */
vecOffsA = vaddq(vecOffsA, (uint16_t) 1);
/*
 * load {a01 a01 a01 a11 a11 a11 a21 a21}
 */
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
 * move to next B row
 */
vecOffsB = vaddq(vecOffsB, (uint16_t) 3);
/*
 * load {b10, b11, b12, b10, b11, b12, b10, b11}
 */
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
/*
 * { a00 b00 + a01 b10     a00 b01 + a01 b11     a00 b02 + a01 b12
 *   a10 b00 + a11 b10     a10 b01 + a11 b11     a10 b02 + a11 b12
 *   a20 b00 + a21 b10     a20 b01 + a21 b11 }
 */
vecDst = vfmaq(vecDst, vecInA, vecInB);
/*
 * move to 3rd column of matrix A
 */
vecOffsA = vaddq(vecOffsA, (uint16_t) 1);
/*
 * load {a02 a02 a02 a12 a12 a12 a22 a22}
 */
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
 * move to next B row
 */
vecOffsB = vaddq(vecOffsB, (uint16_t) 3);
/*
 * load {b20, b21, b22, b20, b21, b22, b20, b21}
 */
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
/*
 * {a00 b00 + a01 b10 + a02 b20  a00 b01 + a01 b11 + a02 b21  a00 b02 + a01 b12 + a02 b22},
 *  a10 b00 + a11 b10 + a12 b20  a10 b01 + a11 b11 + a12 b21  a10 b02 + a11 b12 + a12 b22},
 *  a20 b00 + a21 b10 + a22 b20  a20 b01 + a21 b11 + a22 b21   }
 */
vecDst = vfmaq(vecDst, vecInA, vecInB);
/*
 * Store the first 8 results in the destination buffer
 */
vst1q(pOut, vecDst); pOut += 8;
/* last element computed in scalar mode
 * c22 = a20 b02 + a21 b12 + a22 b22
 */
_Float16 * pA = (_Float16 *)pSrcA->pData;
_Float16 * pB = (_Float16 *)pSrcB->pData;
*pOut = pA[2*3] * pB[2] + pA[2*3+1] * pB[3+2] + pA[2*3+2] * pB[2*3+2];
return (ARM_MATH_SUCCESS);
}
/**
  @brief  4x4 f16 matrix multiply specialization using MVE gather loads.
          Two accumulator vectors hold the top two and bottom two rows of
          the product (8 lanes each), built up column of A by row of B.
  @param[in]  pSrcA  first input matrix (4x4)
  @param[in]  pSrcB  second input matrix (4x4)
  @param[out] pDst   output matrix (4x4)
  @return     ARM_MATH_SUCCESS (no size checking; caller dispatches)
 */
__STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
const arm_matrix_instance_f16 *pSrcA,
const arm_matrix_instance_f16 *pSrcB,
arm_matrix_instance_f16 *pDst)
{
/* offsetA allows to read and duplicate 2 successive column elements of A */
static const uint16_t offsetA[8] = { 0, 0, 0, 0, 4, 4, 4, 4 };
/* offsetB allows to read and duplicate 1 row of B */
static const uint16_t offsetB[8] = { 0, 1, 2, 3, 0, 1, 2, 3 };
uint16x8_t vecOffsA, vecOffsB;
f16x8_t vecInA, vecInB, vecDst0, vecDst1;
float16_t *pOut = pDst->pData; /* output data matrix pointer */
/*
 * load initial offsets
 */
vecOffsA = vldrhq_u16((uint16_t const *) offsetA);
vecOffsB = vldrhq_u16((uint16_t const *) offsetB);
/*
 * load {a00 a00 a00 a00 a10 a10 a10 a10}
 */
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
 * load {b00 b01 b02 b03 b00 b01 b02 b03}
 */
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
/*
 * { a00 b00       a00 b01     a00 b02     a00 b03
 *   a10 b00       a10 b01     a10 b02     a10 b03 }
 */
vecDst0 = vmulq(vecInA, vecInB);
/*
 * jump 2 x A rows (2nd half of matrix)
 */
vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
/*
 * load {a20 a20 a20 a20 a30 a30 a30 a30}
 */
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
 * { a20 b00       a20 b01     a20 b02     a20 b03
 *   a30 b00       a30 b01     a30 b02     a30 b03 }
 */
vecDst1 = vmulq(vecInA, vecInB);
/*
 * rewind back to top half of the A matrix (2nd column)
 */
vecOffsA = vsubq(vecOffsA, (uint16_t) 7);
/*
 * load {a01 a01 a01 a01 a11 a11 a11 a11}
 */
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
 * move to next B row
 */
vecOffsB = vaddq(vecOffsB, (uint16_t) 4);
/*
 * load {b10, b11, b12, b13, b10, b11, b12, b13}
 */
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
/*
 * { a00 b00 + a01 b10   a00 b01 + a01 b11   a00 b02 + a01 b12     a00 b03 + a01 b13
 *   a10 b00 + a11 b10   a10 b01 + a11 b11   a10 b02 + a11 b12     a10 b03 + a11 b13 }
 */
vecDst0 = vfmaq(vecDst0, vecInA, vecInB);
/*
 * jump 2 x A rows (2nd half of matrix)
 */
vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
/*
 * load {a21 a21 a21 a21 a31 a31 a31 a31}
 */
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
 * {a20 b00 + a21 b10   a20 b01 + a21 b11  a20 b02 + a21 b12    a20 b03 + a21 b13
 *  a30 b00 + a31 b10   a30 b01 + a31 b11  a30 b02 + a31 b12    a30 b03 + a31 b13 }
 */
vecDst1 = vfmaq(vecDst1, vecInA, vecInB);
/*
 * rewind back to top half of the A matrix (3rd column)
 */
vecOffsA = vsubq(vecOffsA, (uint16_t) 7);
/*
 * load {a02 a02 a02 a02 a12 a12 a12 a12}
 */
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
 * move to next B row
 */
vecOffsB = vaddq(vecOffsB, (uint16_t) 4);
/*
 * load {b20, b21, b22, b23, b20, b21, b22, b23}
 */
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
/*
 * { a00 b00 + a01 b10 + a02 b20    a00 b01 + a01 b11 + a02 b21    a00 b02 + a01 b12 + a02 b22   a00 b03 + a01 b13 + a02 b23
 *   a10 b00 + a11 b10 + a12 b20    a10 b01 + a11 b11 + a12 b21    a10 b02 + a11 b12 + a12 b22   a10 b03 + a11 b13 + a12 b23 }
 */
vecDst0 = vfmaq(vecDst0, vecInA, vecInB);
/*
 * jump 2 x A rows
 */
vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
/*
 * load {a22 a22 a22 a22 a32 a32 a32 a32}
 */
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
 * {a20 b00 + a21 b10 + a22 b20   a20 b01 + a21 b11 + a22 b21  a20 b02 + a21 b12 + a22 b22    a20 b03 + a21 b13 + a22 b23
 *  a30 b00 + a31 b10 + a32 b20   a30 b01 + a31 b11 + a32 b21  a30 b02 + a31 b12 + a32 b22    a30 b03 + a31 b13 + a32 b23 }
 */
vecDst1 = vfmaq(vecDst1, vecInA, vecInB);
/*
 * rewind back to top half of the A matrix (4th column)
 */
vecOffsA = vsubq(vecOffsA, (uint16_t) 7);
/*
 * load {a03 a03 a03 a03 a13 a13 a13 a13}
 */
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
 * move to next B row
 */
vecOffsB = vaddq(vecOffsB, (uint16_t) 4);
/*
 * load {b30, b31, b32, b33, b30, b31, b32, b33}
 */
vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
/*
 * { a00 b00 +...+ a03 b30,    a00 b01 +...+ a03 b31,   a00 b02 +...+ a03 b32,   a00 b03 +...+ a03 b33
 *   a10 b00 +...+ a13 b30,    a10 b01 +...+ a13 b31,   a10 b02 +...+ a13 b32,   a10 b03 +...+ a13 b33 }
 */
vecDst0 = vfmaq(vecDst0, vecInA, vecInB);
/*
 * jump 2 x A rows
 */
vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
/*
 * load {a23 a23 a23 a23 a33 a33 a33 a33}
 */
vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
/*
 * {a20 b00 +...+ a23 b30,   a20 b01 +...+ a23 b31,   a20 b02 +...+ a23 b32,   a20 b03 +...+ a23 b33
 *  a30 b00 +...+ a33 b30,   a30 b01 +...+ a33 b31,   a30 b02 +...+ a33 b32,   a30 b03 +...+ a33 b33 }
 */
vecDst1 = vfmaq(vecDst1, vecInA, vecInB);
/*
 * Store the result in the destination buffer (two rows per vector)
 */
vst1q(pOut, vecDst0); pOut += 8;
vst1q(pOut, vecDst1);
return (ARM_MATH_SUCCESS);
}
/**
  @brief         Floating-point (f16) matrix multiplication — MVE implementation.
                 Dispatches to 2x2/3x3/4x4 specializations for small square
                 matrices, otherwise processes 4 output rows at a time with an
                 MVE-vectorized inner product, then handles the remaining rows.
  @param[in]     pSrcA  points to the first input matrix structure
  @param[in]     pSrcB  points to the second input matrix structure
  @param[out]    pDst   points to the output matrix structure
  @return        ARM_MATH_SIZE_MISMATCH when ARM_MATH_MATRIX_CHECK is enabled
                 and the dimensions are incompatible, otherwise ARM_MATH_SUCCESS.
 */
arm_status arm_mat_mult_f16(
  const arm_matrix_instance_f16 * pSrcA,
  const arm_matrix_instance_f16 * pSrcB,
  arm_matrix_instance_f16 * pDst)
{
  float16_t *pInB = pSrcB->pData;   /* input data matrix pointer B */
  float16_t *pInA = pSrcA->pData;   /* input data matrix pointer A */
  float16_t *pOut = pDst->pData;    /* output data matrix pointer */
  int        numRowsA = pSrcA->numRows;  /* number of rows of input matrix A    */
  int        numColsB = pSrcB->numCols;  /* number of columns of input matrix B */
  int        numColsA = pSrcA->numCols;  /* number of columns of input matrix A */
  uint32_t   blkCnt;                     /* loop counters */
  int        i;

#ifdef ARM_MATH_MATRIX_CHECK
  /* Check for matrix mismatch condition */
  if ((pSrcA->numCols != pSrcB->numRows) ||
      (pSrcA->numRows != pDst->numRows)  ||
      (pSrcB->numCols != pDst->numCols))
  {
    /* FIX: the original code assigned an undeclared 'status' variable here
       (a compile error with the check enabled) and then fell off the end of
       the function without returning; report the mismatch directly instead. */
    return (ARM_MATH_SIZE_MISMATCH);
  }
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  /* small squared matrix specialized routines */
  if (numRowsA == numColsB && numColsB == numColsA)
  {
    if (numRowsA == 2)
      return arm_mat_mult_f16_2x2_mve(pSrcA, pSrcB, pDst);
    else if (numRowsA == 3)
      return arm_mat_mult_f16_3x3_mve(pSrcA, pSrcB, pDst);
    else if (numRowsA == 4)
      return arm_mat_mult_f16_4x4_mve(pSrcA, pSrcB, pDst);
  }

  /* main loop: process 4 output rows per iteration */
  i = numRowsA / 4;
  while (i > 0)
  {
    float16_t *pInA0, *pInA1, *pInA2, *pInA3;
    float16_t *pInB0;
    float16_t *pOut0, *pOut1, *pOut2, *pOut3;
    f16x8_t    vecMac0, vecMac1, vecMac2, vecMac3;
    f16x8_t    vecInB;

    /* pointers to 4 consecutive output rows */
    pOut0 = pOut;
    pOut1 = pOut0 + numColsB;
    pOut2 = pOut1 + numColsB;
    pOut3 = pOut2 + numColsB;
    pInB0 = pInB;

    int k = numColsB >> 3;
    while (k > 0)
    {
      /* pointers to 4 consecutive Matrix A rows */
      pInA0 = pInA;
      pInA1 = pInA0 + numColsA;
      pInA2 = pInA1 + numColsA;
      pInA3 = pInA2 + numColsA;

      vecMac0 = vdupq_n_f16(0.0f16);
      vecMac1 = vdupq_n_f16(0.0f16);
      vecMac2 = vdupq_n_f16(0.0f16);
      vecMac3 = vdupq_n_f16(0.0f16);

      blkCnt = numColsA;
      while (blkCnt > 0U)
      {
        /*
         * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3..., bi,4n+7}
         */
        vecInB  = *(f16x8_t *) pInB0;   /* vldrhq_f16(pInB0, 0); */
        vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
        vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
        vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
        vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
        /* advance to the same columns of the next B row */
        pInB0 = pInB0 + numColsB;
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
      }
      /* Store the results (4 x 8 block) in the destination buffer */
      vst1q(pOut0, vecMac0); pOut0 += 8;
      vst1q(pOut1, vecMac1); pOut1 += 8;
      vst1q(pOut2, vecMac2); pOut2 += 8;
      vst1q(pOut3, vecMac3); pOut3 += 8;
      /*
       * rewind B to row 0, next 8-column group
       */
      pInB0 -= (numColsB * numColsA) - 8;
      k--;
    }

    int colBLeft = numColsB & 7;
    if (colBLeft)
    {
      pInA0 = pInA;
      pInA1 = pInA0 + numColsA;
      pInA2 = pInA1 + numColsA;
      pInA3 = pInA2 + numColsA;
      mve_pred16_t p0 = vctp16q(colBLeft);

      vecMac0 = vdupq_n_f16(0.0f16);
      vecMac1 = vdupq_n_f16(0.0f16);
      vecMac2 = vdupq_n_f16(0.0f16);
      vecMac3 = vdupq_n_f16(0.0f16);

      blkCnt = numColsA;
      while (blkCnt > 0U)
      {
        /*
         * load {bi,4n+0, bi,4n+1, bi,4n+2, ..bi,4n+colBLeft-1, 0, ..}
         */
        vecInB  = vldrhq_z_f16(pInB0, p0);
        vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
        vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
        vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
        vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
        pInB0 = pInB0 + numColsB;
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
      }
      /* Store the results (4 x colBLeft block) in the destination buffer */
      vstrhq_p_f16(pOut0, vecMac0, p0);
      vstrhq_p_f16(pOut1, vecMac1, p0);
      vstrhq_p_f16(pOut2, vecMac2, p0);
      vstrhq_p_f16(pOut3, vecMac3, p0);
    }
    /* move to the next block of 4 A rows / 4 output rows */
    pInA += 4 * numColsA;
    pOut += 4 * numColsB;
    i--;
  }

  /*
   * non multiple of 4 rows for Matrix A
   * process single row
   */
  if (numRowsA & 3)
  {
    i = numRowsA & 3;
    do
    {
      float16_t *pInA0;
      float16_t *pInB0;
      float16_t *pOut0;
      f16x8_t    vecInB;
      f16x8_t    vecMac0;

      pOut0 = pOut;
      pInB0 = pInB;

      int k = numColsB >> 3;
      while (k > 0)
      {
        pInA0 = pInA;

        vecMac0 = vdupq_n_f16(0.0f16);
        blkCnt = numColsA;
        while (blkCnt > 0U)
        {
          /*
           * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3, ...bi,4n+7}
           */
          vecInB  = *(f16x8_t *) pInB0;   /* vldrhq_f16(pInB0, 0); */
          vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
          pInB0 = pInB0 + numColsB;
          /*
           * Decrement the blockSize loop counter
           */
          blkCnt--;
        }
        /* Store the results (1 x 8 block) in the destination buffer */
        vst1q(pOut0, vecMac0); pOut0 += 8;
        /*
         * rewind B to row 0, next 8-column group
         */
        pInB0 -= (numColsB * numColsA) - 8;
        k--;
      }

      int colBLeft = numColsB & 7;
      if (colBLeft)
      {
        pInA0 = pInA;
        mve_pred16_t p0 = vctp16q(colBLeft);

        vecMac0 = vdupq_n_f16(0.0f16);
        blkCnt = numColsA;
        while (blkCnt > 0U)
        {
          /*
           * load {bi,4n+0, bi,4n+1, bi,4n+2, ..., bi,4n+colBLeft, 0, ...}
           */
          vecInB  = vldrhq_z_f16(pInB0, p0);
          vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
          pInB0 = pInB0 + numColsB;
          /*
           * Decrement the blockSize loop counter
           */
          blkCnt--;
        }
        /* Store the results (1 x colBLeft block) in the destination buffer */
        vstrhq_p_f16(pOut0, vecMac0, p0);
      }
      /* move to the next A row / output row */
      pInA += 1 * numColsA;
      pOut += 1 * numColsB;
    }
    while (--i);
  }
  /*
   * Return to application
   */
  return (ARM_MATH_SUCCESS);
}
#else
/**
  @brief         Floating-point (f16) matrix multiplication — scalar fallback.
                 Computes pDst = pSrcA * pSrcB with a straightforward
                 row-by-column dot product.
  @param[in]     pSrcA  points to the first input matrix structure
  @param[in]     pSrcB  points to the second input matrix structure
  @param[out]    pDst   points to the output matrix structure
  @return        ARM_MATH_SIZE_MISMATCH when ARM_MATH_MATRIX_CHECK is enabled
                 and the dimensions are incompatible, otherwise ARM_MATH_SUCCESS.
 */
arm_status arm_mat_mult_f16(
  const arm_matrix_instance_f16 * pSrcA,
  const arm_matrix_instance_f16 * pSrcB,
  arm_matrix_instance_f16 * pDst)
{
  arm_status status;                         /* Status of matrix multiplication */

#ifdef ARM_MATH_MATRIX_CHECK
  /* Check for matrix mismatch condition */
  if ((pSrcA->numCols != pSrcB->numRows) ||
      (pSrcA->numRows != pDst->numRows)  ||
      (pSrcB->numCols != pDst->numCols))
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    const float16_t *pA    = pSrcA->pData;   /* base of matrix A (row-major) */
    const float16_t *pB    = pSrcB->pData;   /* base of matrix B (row-major) */
    float16_t       *pC    = pDst->pData;    /* base of result matrix        */
    uint32_t         rowsA = pSrcA->numRows; /* rows of A (and of C)         */
    uint32_t         colsA = pSrcA->numCols; /* cols of A == rows of B       */
    uint32_t         colsB = pSrcB->numCols; /* cols of B (and of C)         */
    uint32_t         row   = 0U;

    /* row-by-column dot products; do/while mirrors the reference loop shape */
    do
    {
      const float16_t *aRow = &pA[row * colsA];  /* start of current A row */
      uint32_t col = 0U;
      do
      {
        /* walk down column 'col' of B while walking across row 'row' of A */
        const float16_t *aPtr = aRow;
        const float16_t *bPtr = &pB[col];
        float16_t        acc  = 0.0f;          /* dot-product accumulator */
        uint32_t         cnt  = colsA;

        while (cnt > 0U)
        {
          /* c(row,col) += a(row,k) * b(k,col) — same accumulation order as
             the unrolled reference implementation */
          acc   = acc + (*aPtr++ * *bPtr);
          bPtr += colsB;
          cnt--;
        }

        pC[(row * colsB) + col] = acc;
        col++;
      } while (col < colsB);

      row++;
    } while (row < rowsA);

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of MatrixMult group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

/* ===== file boundary: arm_mat_scale_f16.c (new file) ===== */
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_scale_f16.c
* Description: Multiplies a floating-point matrix by a scalar
*
* $Date: 18. March 2020
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixScale
@{
*/
/**
@brief Floating-point matrix scaling.
@param[in] pSrc points to input matrix
@param[in] scale scale factor to be applied
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
/**
  @brief         Floating-point (f16) matrix scaling — MVE implementation.
                 Multiplies every element of pSrc by 'scale' into pDst,
                 8 halfwords per vector iteration with a predicated tail.
  @param[in]     pSrc   points to input matrix
  @param[in]     scale  scale factor to be applied
  @param[out]    pDst   points to output matrix structure
  @return        execution status
                   - \ref ARM_MATH_SUCCESS       : Operation successful
                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
 */
arm_status arm_mat_scale_f16(
  const arm_matrix_instance_f16 * pSrc,
  float16_t                       scale,
  arm_matrix_instance_f16       * pDst)
{
  arm_status status;                 /* status of matrix scaling */

#ifdef ARM_MATH_MATRIX_CHECK
  /* Check for matrix mismatch condition */
  if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    float16_t *pIn  = pSrc->pData;   /* input data matrix pointer  */
    float16_t *pOut = pDst->pData;   /* output data matrix pointer */
    uint32_t   numSamples;           /* total number of elements in the matrix */
    uint32_t   blkCnt;               /* loop counters */
    f16x8_t    vecIn, vecOut;
    float16_t const *pInVec;

    pInVec = (float16_t const *) pIn;
    /*
     * Total number of samples in the input matrix
     */
    numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;

    blkCnt = numSamples >> 3;
    while (blkCnt > 0U)
    {
      /*
       * C(m,n) = A(m,n) * scale
       * Scaling and results are stored in the destination buffer.
       */
      vecIn = vld1q(pInVec);
      pInVec += 8;

      vecOut = vmulq(vecIn, scale);
      vst1q(pOut, vecOut);
      pOut += 8;
      /*
       * Decrement the blockSize loop counter
       */
      blkCnt--;
    }
    /*
     * tail (fewer than 8 samples left)
     */
    blkCnt = numSamples & 7;
    if (blkCnt > 0U)
    {
      mve_pred16_t p0 = vctp16q(blkCnt);
      /* FIX: use a predicated (zeroing) load instead of an unpredicated
         vld1q — the original read a full 8 halfwords here and could access
         memory past the end of the input buffer. */
      vecIn  = vldrhq_z_f16(pInVec, p0);
      vecOut = vmulq(vecIn, scale);
      vstrhq_p(pOut, vecOut, p0);
    }
    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#else
/**
  @brief         Floating-point (f16) matrix scaling — scalar fallback.
                 Writes pDst[i] = pSrc[i] * scale for every element.
  @param[in]     pSrc   points to input matrix
  @param[in]     scale  scale factor to be applied
  @param[out]    pDst   points to output matrix structure
  @return        execution status
                   - \ref ARM_MATH_SUCCESS       : Operation successful
                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
 */
arm_status arm_mat_scale_f16(
  const arm_matrix_instance_f16 * pSrc,
  float16_t                       scale,
  arm_matrix_instance_f16       * pDst)
{
  arm_status status;                 /* Status of matrix scaling */

#ifdef ARM_MATH_MATRIX_CHECK
  /* Check for matrix mismatch condition */
  if ((pSrc->numRows != pDst->numRows) ||
      (pSrc->numCols != pDst->numCols))
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    const float16_t *pIn  = pSrc->pData;  /* input data matrix pointer  */
    float16_t       *pOut = pDst->pData;  /* output data matrix pointer */

    /* Total number of samples in input matrix */
    uint32_t count = (uint32_t) pSrc->numRows * pSrc->numCols;

    /* C(m,n) = A(m,n) * scale — each element is independent, so a single
       flat loop over the buffer is equivalent to the unrolled reference */
    for (uint32_t idx = 0U; idx < count; idx++)
    {
      pOut[idx] = pIn[idx] * scale;
    }

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixScale group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

/* ===== file boundary: arm_mat_sub_f16.c (new file) ===== */
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_sub_f16.c
* Description: Floating-point matrix subtraction
*
* $Date: 18. March 2020
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixSub
@{
*/
/**
@brief Floating-point matrix subtraction.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_sub_f16(
  const arm_matrix_instance_f16 * pSrcA,
  const arm_matrix_instance_f16 * pSrcB,
  arm_matrix_instance_f16 * pDst)
{
    arm_status status;                   /* Status of matrix subtraction */
    uint32_t numSamples;                 /* Total number of elements in the matrix */
    float16_t *pDataA, *pDataB, *pDataDst;
    f16x8_t vecA, vecB, vecDst;          /* 8-lane f16 vector registers */
    float16_t const *pSrcAVec;
    float16_t const *pSrcBVec;
    uint32_t blkCnt;                     /* Loop counter */

    pDataA = pSrcA->pData;
    pDataB = pSrcB->pData;
    pDataDst = pDst->pData;
    pSrcAVec = (float16_t const *) pDataA;
    pSrcBVec = (float16_t const *) pDataB;

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition: A, B and Dst must share dimensions */
    if ((pSrcA->numRows != pSrcB->numRows) ||
        (pSrcA->numCols != pSrcB->numCols) ||
        (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
    {
        /*
         * Total number of samples in the input matrix
         */
        numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
        /* Main loop: 8 f16 elements per iteration */
        blkCnt = numSamples >> 3;
        while (blkCnt > 0U)
        {
            /* C(m,n) = A(m,n) - B(m,n) */
            /* Subtract and then store the results in the destination buffer. */
            vecA = vld1q(pSrcAVec);
            pSrcAVec += 8;
            vecB = vld1q(pSrcBVec);
            pSrcBVec += 8;
            vecDst = vsubq(vecA, vecB);
            vst1q(pDataDst, vecDst);
            pDataDst += 8;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }
        /*
         * tail: 0..7 remaining elements
         * (will be merged thru tail predication)
         */
        blkCnt = numSamples & 7;
        if (blkCnt > 0U)
        {
            mve_pred16_t p0 = vctp16q(blkCnt);
            vecA = vld1q(pSrcAVec);
            vecB = vld1q(pSrcBVec);
            /* Predicated subtract: inactive lanes keep prior vecDst content,
               but only the active lanes are written by the store below. */
            vecDst = vsubq_m(vecDst, vecA, vecB, p0);
            vstrhq_p(pDataDst, vecDst, p0);
        }
        status = ARM_MATH_SUCCESS;
    }
    /* Return to application */
    return (status);
}
#else
arm_status arm_mat_sub_f16(
  const arm_matrix_instance_f16 * pSrcA,
  const arm_matrix_instance_f16 * pSrcB,
  arm_matrix_instance_f16 * pDst)
{
    /* Scalar element-wise matrix subtraction: Dst = A - B. */
    float16_t *pA = pSrcA->pData;    /* cursor over matrix A */
    float16_t *pB = pSrcB->pData;    /* cursor over matrix B */
    float16_t *pC = pDst->pData;     /* cursor over destination */
    uint32_t total;                  /* element count of the matrix */
    uint32_t cnt;                    /* loop counter */
    arm_status status;               /* status of matrix subtraction */

#ifdef ARM_MATH_MATRIX_CHECK
    /* All three matrices must share the same dimensions. */
    if ((pSrcA->numRows != pSrcB->numRows) ||
        (pSrcA->numCols != pSrcB->numCols) ||
        (pSrcA->numRows != pDst->numRows) ||
        (pSrcA->numCols != pDst->numCols))
    {
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
    {
        total = (uint32_t) pSrcA->numRows * pSrcA->numCols;

#if defined (ARM_MATH_LOOPUNROLL)
        /* Unrolled main loop: four differences per iteration. */
        for (cnt = total >> 2U; cnt > 0U; cnt--)
        {
            *pC++ = (*pA++) - (*pB++);
            *pC++ = (*pA++) - (*pB++);
            *pC++ = (*pA++) - (*pB++);
            *pC++ = (*pA++) - (*pB++);
        }
        /* 0..3 elements left over after unrolling. */
        cnt = total & 3U;
#else
        /* No unrolling: process every element in the tail loop. */
        cnt = total;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
        for (; cnt > 0U; cnt--)
        {
            /* C(m,n) = A(m,n) - B(m,n) */
            *pC++ = (*pA++) - (*pB++);
        }
        status = ARM_MATH_SUCCESS;
    }
    /* Return to application */
    return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixSub group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

@ -0,0 +1,202 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_trans_f16.c
* Description: Floating-point matrix transpose
*
* $Date: 18. March 2020
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixTrans
@{
*/
/**
@brief Floating-point matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_trans_f16(
  const arm_matrix_instance_f16 * pSrc,
  arm_matrix_instance_f16 * pDst)
{
    /* f16 matrix transpose: dispatch small squares to dedicated MVE kernels. */
    arm_status status;               /* status of matrix transpose */

#ifdef ARM_MATH_MATRIX_CHECK
    /* Destination must have swapped dimensions. */
    if ((pSrc->numRows != pDst->numCols) ||
        (pSrc->numCols != pDst->numRows))
    {
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
    {
        if (pDst->numRows == pDst->numCols)
        {
            /* Square matrices up to 4x4 use specialized routines. */
            switch (pDst->numCols)
            {
                case 1:
                    /* 1x1 transpose is a plain copy. */
                    pDst->pData[0] = pSrc->pData[0];
                    return (ARM_MATH_SUCCESS);
                case 2:
                    return arm_mat_trans_16bit_2x2((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
                case 3:
                    return arm_mat_trans_16bit_3x3_mve((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
                case 4:
                    return arm_mat_trans_16bit_4x4_mve((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
                default:
                    break;
            }
        }
        /* Any other shape goes through the generic gather-based transpose. */
        arm_mat_trans_16bit_generic(pSrc->numRows, pSrc->numCols, (uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
        status = ARM_MATH_SUCCESS;
    }
    /* Return to application */
    return (status);
}
#else
arm_status arm_mat_trans_f16(
  const arm_matrix_instance_f16 * pSrc,
  arm_matrix_instance_f16 * pDst)
{
    /* Scalar f16 transpose: copy each source row into one destination column. */
    float16_t *pRead = pSrc->pData;      /* walks the source, row by row */
    float16_t *pBase = pDst->pData;      /* start of destination buffer */
    float16_t *pWrite;                   /* walks one destination column */
    uint16_t nRows = pSrc->numRows;      /* number of source rows */
    uint16_t nCols = pSrc->numCols;      /* number of source columns */
    uint32_t colCnt;                     /* column loop counter */
    uint32_t rowCnt = nRows;             /* row loop counter */
    uint32_t destCol = 0U;               /* destination column == source row index */
    arm_status status;                   /* status of matrix transpose */

#ifdef ARM_MATH_MATRIX_CHECK
    /* Destination must have swapped dimensions. */
    if ((pSrc->numRows != pDst->numCols) ||
        (pSrc->numCols != pDst->numRows))
    {
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
    {
        do
        {
            /* Point at the top of the destination column for this source row. */
            pWrite = pBase + destCol;

#if defined (ARM_MATH_LOOPUNROLL)
            /* Unrolled: scatter four row elements down the column per pass. */
            for (colCnt = nCols >> 2U; colCnt > 0U; colCnt--)
            {
                *pWrite = *pRead++;
                pWrite += nRows;
                *pWrite = *pRead++;
                pWrite += nRows;
                *pWrite = *pRead++;
                pWrite += nRows;
                *pWrite = *pRead++;
                pWrite += nRows;
            }
            /* 0..3 columns remain. */
            colCnt = nCols & 3U;
#else
            /* No unrolling: the tail loop handles every column. */
            colCnt = nCols;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
            while (colCnt > 0U)
            {
                *pWrite = *pRead++;
                /* Next element of the same destination column. */
                pWrite += nRows;
                colCnt--;
            }
            destCol++;
        } while (--rowCnt > 0U);         /* one pass per source row */
        status = ARM_MATH_SUCCESS;
    }
    /* Return to application */
    return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of MatrixTrans group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

@ -48,123 +48,8 @@
#if defined(ARM_MATH_MVEI)
__STATIC_INLINE arm_status arm_mat_trans_16bit_2x2(uint16_t * pDataSrc, uint16_t * pDataDest)
{
    /* 2x2 transpose: the diagonal stays put, the off-diagonal pair swaps. */
    pDataDest[0] = pDataSrc[0];
    pDataDest[1] = pDataSrc[2];
    pDataDest[2] = pDataSrc[1];
    pDataDest[3] = pDataSrc[3];
    return (ARM_MATH_SUCCESS);
}
/* Transpose a 3x3 matrix of 16-bit elements using one MVE scatter store. */
static arm_status arm_mat_trans_16bit_3x3_mve(uint16_t * pDataSrc, uint16_t * pDataDest)
{
    /* Destination offsets for the first 8 source elements (row-major). */
    static const uint16_t stridesTr33[8] = { 0, 3, 6, 1, 4, 7, 2, 5 };
    uint16x8_t vecOffs1;
    uint16x8_t vecIn1;
    /*
     *
     * | 0 1 2 | | 0 3 6 | 8 x 16 flattened version | 0 3 6 1 4 7 2 5 |
     * | 3 4 5 | => | 1 4 7 | => | 8 . . . . . . . |
     * | 6 7 8 | | 2 5 8 | (row major)
     *
     */
    vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr33);
    vecIn1 = vldrhq_u16((uint16_t const *) pDataSrc);
    vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1);
    /* The 9th element does not fit in the vector; copy it directly
       (it is on the diagonal, so its position is unchanged). */
    pDataDest[8] = pDataSrc[8];
    return (ARM_MATH_SUCCESS);
}
/* Transpose a 4x4 matrix of 16-bit elements using two MVE scatter stores. */
static arm_status arm_mat_trans_16bit_4x4_mve(uint16_t * pDataSrc, uint16_t * pDataDest)
{
    /* Destination offsets for the low and high 8-element halves (row-major). */
    static const uint16_t stridesTr44_1[8] = { 0, 4, 8, 12, 1, 5, 9, 13 };
    static const uint16_t stridesTr44_2[8] = { 2, 6, 10, 14, 3, 7, 11, 15 };
    uint16x8_t vecOffs1, vecOffs2;
    uint16x8_t vecIn1, vecIn2;
    uint16_t const * pDataSrcVec = (uint16_t const *) pDataSrc;
    /*
     * 4x4 Matrix transposition
     *
     * | 0 1 2 3 | | 0 4 8 12 | 8 x 16 flattened version
     * | 4 5 6 7 | => | 1 5 9 13 | => [0 4 8 12 1 5 9 13]
     * | 8 9 10 11 | | 2 6 10 14 | [2 6 10 14 3 7 11 15]
     * | 12 13 14 15 | | 3 7 11 15 |
     */
    vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr44_1);
    vecOffs2 = vldrhq_u16((uint16_t const *) stridesTr44_2);
    /* Load the 16 source elements as two 8-lane vectors. */
    vecIn1 = vldrhq_u16(pDataSrcVec);
    pDataSrcVec += 8;
    vecIn2 = vldrhq_u16(pDataSrcVec);
    /* Scatter each half to its transposed positions. */
    vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1);
    vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs2, vecIn2);
    return (ARM_MATH_SUCCESS);
}
#include "arm_helium_utils.h"
/* Generic transpose for 16-bit matrices of any shape: each source column is
   gathered with stride srcCols and stored contiguously as a destination row. */
static arm_status arm_mat_trans_16bit_generic(
    uint16_t srcRows,
    uint16_t srcCols,
    uint16_t * pDataSrc,
    uint16_t * pDataDest)
{
    uint16x8_t vecOffs;          /* gather offsets: k * srcCols for lane k */
    uint32_t i;                  /* column loop counter */
    uint32_t blkCnt;             /* row block counter */
    uint16_t const *pDataC;      /* cursor into the current source column */
    uint16_t *pDataDestR;        /* cursor into the current destination row */
    uint16x8_t vecIn;
    /* Build the per-lane offsets 0, srcCols, 2*srcCols, ..., 7*srcCols. */
    vecOffs = vidupq_u16((uint32_t)0, 1);
    vecOffs = vecOffs * srcCols;
    i = srcCols;
    while(i > 0U)
    {
        pDataC = (uint16_t const *) pDataSrc;
        pDataDestR = pDataDest;
        /* Gather 8 column elements at a time. */
        blkCnt = srcRows >> 3;
        while (blkCnt > 0U)
        {
            vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs);
            vstrhq_u16(pDataDestR, vecIn);
            pDataDestR += 8;
            /* Advance 8 rows down the same column. */
            pDataC = pDataC + srcCols * 8;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }
        /*
         * tail: 0..7 remaining rows, handled with predication
         */
        blkCnt = srcRows & 7;
        if (blkCnt > 0U)
        {
            mve_pred16_t p0 = vctp16q(blkCnt);
            vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs);
            vstrhq_p_u16(pDataDestR, vecIn, p0);
        }
        /* Next source column / next destination row. */
        pDataSrc += 1;
        pDataDest += srcRows;
        i--;
    }
    return (ARM_MATH_SUCCESS);
}
arm_status arm_mat_trans_q15(

@ -0,0 +1,394 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_vec_mult_f16.c
* Description: Floating-point matrix and vector multiplication
*
 * $Date:        07. July 2020
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
* @ingroup groupMatrix
*/
/**
* @addtogroup MatrixVectMult
* @{
*/
/**
* @brief Floating-point matrix and vector multiplication.
* @param[in] *pSrcMat points to the input matrix structure
* @param[in] *pVec points to input vector
* @param[out] *pDst points to output vector
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_mat_vec_mult_f16(
    const arm_matrix_instance_f16 *pSrcMat,
    const float16_t *pSrcVec,
    float16_t *pDstVec)
{
    uint32_t numRows = pSrcMat->numRows;
    uint32_t numCols = pSrcMat->numCols;
    const float16_t *pSrcA = pSrcMat->pData;
    const float16_t *pInA0;
    const float16_t *pInA1;
    float16_t *px;                 /* cursor over the destination vector */
    int32_t row;
    uint32_t blkCnt;               /* loop counters */
    row = numRows;
    px = pDstVec;
    /*
     * compute 4 rows in parallel
     */
    while (row >= 4)
    {
        const float16_t *pInA2, *pInA3;
        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
        f16x8_t vecIn, acc0, acc1, acc2, acc3;
        float16_t const *pSrcVecPtr = pSrcVec;
        /*
         * Initialize the pointers to 4 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        pInA2 = pInA1 + numCols;
        pInA3 = pInA2 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f16(0.0f);
        acc1 = vdupq_n_f16(0.0f);
        acc2 = vdupq_n_f16(0.0f);
        acc3 = vdupq_n_f16(0.0f);
        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;
        pSrcA2Vec = pInA2;
        pSrcA3Vec = pInA3;
        /* Main loop: 8 columns per iteration, one shared vector load
           feeding four row multiply-accumulates. */
        blkCnt = numCols >> 3;
        while (blkCnt > 0U)
        {
            f16x8_t vecA;
            vecIn = vld1q(pInVec);
            pInVec += 8;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 8;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 8;
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vld1q(pSrcA2Vec);
            pSrcA2Vec += 8;
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vld1q(pSrcA3Vec);
            pSrcA3Vec += 8;
            acc3 = vfmaq(acc3, vecIn, vecA);
            blkCnt--;
        }
        /*
         * tail: 0..7 remaining columns
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 7;
        if (blkCnt > 0U)
        {
            mve_pred16_t p0 = vctp16q(blkCnt);
            f16x8_t vecA;
            /* Zeroing load: inactive vector lanes are 0 so they do not
               contribute to the accumulators. */
            vecIn = vldrhq_z_f16(pInVec, p0);
            vecA = vld1q(pSrcA0Vec);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            acc1 = vfmaq(acc1, vecIn, vecA);
            vecA = vld1q(pSrcA2Vec);
            acc2 = vfmaq(acc2, vecIn, vecA);
            vecA = vld1q(pSrcA3Vec);
            acc3 = vfmaq(acc3, vecIn, vecA);
        }
        /*
         * Sum the partial parts (horizontal reduction of each accumulator)
         */
        *px++ = vecAddAcrossF16Mve(acc0);
        *px++ = vecAddAcrossF16Mve(acc1);
        *px++ = vecAddAcrossF16Mve(acc2);
        *px++ = vecAddAcrossF16Mve(acc3);
        pSrcA += numCols * 4;
        /*
         * Decrement the row loop counter
         */
        row -= 4;
    }
    /*
     * compute 2 rows in parallel
     */
    if (row >= 2)
    {
        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
        f16x8_t vecIn, acc0, acc1;
        float16_t const *pSrcVecPtr = pSrcVec;
        /*
         * Initialize the pointers to 2 consecutive MatrixA rows
         */
        pInA0 = pSrcA;
        pInA1 = pInA0 + numCols;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f16(0.0f);
        acc1 = vdupq_n_f16(0.0f);
        pSrcA0Vec = pInA0;
        pSrcA1Vec = pInA1;
        blkCnt = numCols >> 3;
        while (blkCnt > 0U)
        {
            f16x8_t vecA;
            vecIn = vld1q(pInVec);
            pInVec += 8;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 8;
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            pSrcA1Vec += 8;
            acc1 = vfmaq(acc1, vecIn, vecA);
            blkCnt--;
        }
        /*
         * tail: 0..7 remaining columns
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 7;
        if (blkCnt > 0U)
        {
            mve_pred16_t p0 = vctp16q(blkCnt);
            f16x8_t vecA;
            vecIn = vldrhq_z_f16(pInVec, p0);
            vecA = vld1q(pSrcA0Vec);
            acc0 = vfmaq(acc0, vecIn, vecA);
            vecA = vld1q(pSrcA1Vec);
            acc1 = vfmaq(acc1, vecIn, vecA);
        }
        /*
         * Sum the partial parts
         */
        *px++ = vecAddAcrossF16Mve(acc0);
        *px++ = vecAddAcrossF16Mve(acc1);
        pSrcA += numCols * 2;
        row -= 2;
    }
    /* Handle the final row, if any. */
    if (row >= 1)
    {
        f16x8_t vecIn, acc0;
        float16_t const *pSrcA0Vec, *pInVec;
        float16_t const *pSrcVecPtr = pSrcVec;
        /*
         * Initialize the pointers to last MatrixA row
         */
        pInA0 = pSrcA;
        /*
         * Initialize the vector pointer
         */
        pInVec = pSrcVecPtr;
        /*
         * reset accumulators
         */
        acc0 = vdupq_n_f16(0.0f);
        pSrcA0Vec = pInA0;
        blkCnt = numCols >> 3;
        while (blkCnt > 0U)
        {
            f16x8_t vecA;
            vecIn = vld1q(pInVec);
            pInVec += 8;
            vecA = vld1q(pSrcA0Vec);
            pSrcA0Vec += 8;
            acc0 = vfmaq(acc0, vecIn, vecA);
            blkCnt--;
        }
        /*
         * tail: 0..7 remaining columns
         * (will be merged thru tail predication)
         */
        blkCnt = numCols & 7;
        if (blkCnt > 0U)
        {
            mve_pred16_t p0 = vctp16q(blkCnt);
            f16x8_t vecA;
            vecIn = vldrhq_z_f16(pInVec, p0);
            vecA = vld1q(pSrcA0Vec);
            acc0 = vfmaq(acc0, vecIn, vecA);
        }
        /*
         * Sum the partial parts
         */
        *px++ = vecAddAcrossF16Mve(acc0);
    }
}
#else
void arm_mat_vec_mult_f16(const arm_matrix_instance_f16 *pSrcMat, const float16_t *pVec, float16_t *pDst)
{
    /* Scalar matrix-vector product: pDst[r] = dot(row r of pSrcMat, pVec). */
    uint32_t numRows = pSrcMat->numRows;
    uint32_t numCols = pSrcMat->numCols;
    const float16_t *pSrcA = pSrcMat->pData;
    const float16_t *pRow0;          /* cursor over matrix row r   */
    const float16_t *pRow1;          /* cursor over matrix row r+1 */
    const float16_t *pRow2;          /* cursor over matrix row r+2 */
    const float16_t *pRow3;          /* cursor over matrix row r+3 */
    const float16_t *pV;             /* cursor over the input vector */
    float16_t *pOut = pDst;          /* cursor over the output vector */
    uint16_t rowCnt, colCnt;         /* loop counters */
    uint16_t offset = 0u;            /* flat index of the current row group */

    /* Process four matrix rows per pass, sharing each vector load. */
    for (rowCnt = numRows >> 2; rowCnt > 0; rowCnt--)
    {
        float16_t acc0 = 0.0f;
        float16_t acc1 = 0.0f;
        float16_t acc2 = 0.0f;
        float16_t acc3 = 0.0f;

        pV = pVec;
        pRow0 = pSrcA + offset;
        pRow1 = pRow0 + numCols;
        pRow2 = pRow1 + numCols;
        pRow3 = pRow2 + numCols;

        /* One column per iteration: one vector element feeds four rows. */
        for (colCnt = numCols; colCnt > 0; colCnt--)
        {
            float16_t v = *pV++;
            acc0 += *pRow0++ * v;
            acc1 += *pRow1++ * v;
            acc2 += *pRow2++ * v;
            acc3 += *pRow3++ * v;
        }

        /* Store the four dot products. */
        *pOut++ = acc0;
        *pOut++ = acc1;
        *pOut++ = acc2;
        *pOut++ = acc3;
        offset = offset + numCols * 4;
    }

    /* Handle the 1..3 leftover rows one at a time. */
    for (rowCnt = numRows & 3u; rowCnt > 0; rowCnt--)
    {
        float16_t acc = 0.0f;

        pV = pVec;
        pRow0 = pSrcA + offset;

        /* Two columns per iteration. */
        for (colCnt = numCols >> 1; colCnt > 0; colCnt--)
        {
            float16_t v0 = *pV++;
            float16_t v1 = *pV++;
            float16_t m0 = *pRow0++;
            float16_t m1 = *pRow0++;
            acc += m0 * v0;
            acc += m1 * v1;
        }
        /* Odd trailing column, if any. */
        if (numCols & 1u)
        {
            acc += *pRow0++ * *pV++;
        }

        *pOut++ = acc;
        offset = offset + numCols;
    }
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
 * @} end of MatrixVectMult group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

@ -37,31 +37,38 @@
#include "arm_cfft_init_q15.c"
#include "arm_cfft_init_q31.c"
#include "arm_cfft_radix2_f32.c"
#include "arm_cfft_radix2_init_f32.c"
#include "arm_cfft_radix2_init_q15.c"
#include "arm_cfft_radix2_init_q31.c"
#include "arm_cfft_radix2_q15.c"
#include "arm_cfft_radix2_q31.c"
#include "arm_cfft_radix4_f32.c"
#include "arm_cfft_radix4_init_f32.c"
#include "arm_cfft_radix4_init_q15.c"
#include "arm_cfft_radix4_init_q31.c"
#include "arm_cfft_radix4_q15.c"
#include "arm_cfft_radix4_q31.c"
#include "arm_cfft_radix8_f32.c"
#include "arm_rfft_fast_f32.c"
#include "arm_rfft_fast_f64.c"
#include "arm_rfft_fast_init_f32.c"
#include "arm_rfft_fast_init_f64.c"
/* Deprecated */
#include "arm_dct4_f32.c"
#include "arm_dct4_init_f32.c"
#include "arm_dct4_init_q15.c"
#include "arm_dct4_init_q31.c"
#include "arm_dct4_q15.c"
#include "arm_dct4_q31.c"
#include "arm_rfft_f32.c"
#include "arm_rfft_fast_f32.c"
#include "arm_rfft_fast_f64.c"
#include "arm_rfft_fast_init_f32.c"
#include "arm_rfft_fast_init_f64.c"
#include "arm_rfft_q15.c"
#include "arm_rfft_q31.c"
#include "arm_rfft_init_f32.c"
#include "arm_rfft_init_q15.c"
#include "arm_rfft_init_q31.c"
#include "arm_rfft_q15.c"
#include "arm_rfft_q31.c"
#include "arm_cfft_radix4_init_f32.c"
#include "arm_cfft_radix4_init_q15.c"
#include "arm_cfft_radix4_init_q31.c"
#include "arm_cfft_radix2_init_f32.c"
#include "arm_cfft_radix2_init_q15.c"
#include "arm_cfft_radix2_init_q31.c"

@ -331,6 +331,8 @@ set(TESTSRC16
Source/Tests/FIRF16.cpp
Source/Tests/BIQUADF16.cpp
Source/Tests/MISCF16.cpp
Source/Tests/BinaryTestsF16.cpp
Source/Tests/UnaryTestsF16.cpp
Source/Tests/TransformCF16.cpp
Source/Tests/TransformRF16.cpp
)

@ -0,0 +1,32 @@
#include "Test.h"
#include "Pattern.h"
#include "dsp/matrix_functions_f16.h"
/* Test suite exercising binary (two-operand) f16 matrix operations. */
class BinaryTestsF16:public Client::Suite
{
    public:
        BinaryTestsF16(Testing::testID_t id);
        virtual void setUp(Testing::testID_t,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr);
        virtual void tearDown(Testing::testID_t,Client::PatternMgr *mgr);
    private:
        #include "BinaryTestsF16_decl.h"
        /* Input patterns, reference output and matrix dimension list. */
        Client::Pattern<float16_t> input1;
        Client::Pattern<float16_t> input2;
        Client::Pattern<float16_t> ref;
        Client::Pattern<int16_t> dims;
        /* Buffer receiving the computed result, compared against ref. */
        Client::LocalPattern<float16_t> output;
        /* Local copies of inputs since matrix instance in CMSIS-DSP are not using
           pointers to const.
        */
        Client::LocalPattern<float16_t> a;
        Client::LocalPattern<float16_t> b;
        /* Dimensions of the matrices under test. */
        int nbr;
        int nbc;
        /* CMSIS-DSP matrix instances wrapping a, b and output. */
        arm_matrix_instance_f16 in1;
        arm_matrix_instance_f16 in2;
        arm_matrix_instance_f16 out;
};

@ -0,0 +1,32 @@
#include "Test.h"
#include "Pattern.h"
#include "dsp/matrix_functions_f16.h"
/* Test suite exercising unary (single-operand) f16 matrix operations. */
class UnaryTestsF16:public Client::Suite
{
    public:
        UnaryTestsF16(Testing::testID_t id);
        virtual void setUp(Testing::testID_t,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr);
        virtual void tearDown(Testing::testID_t,Client::PatternMgr *mgr);
    private:
        #include "UnaryTestsF16_decl.h"
        /* Input patterns, reference output and matrix dimension list. */
        Client::Pattern<float16_t> input1;
        Client::Pattern<float16_t> input2;
        Client::Pattern<float16_t> ref;
        Client::Pattern<int16_t> dims;
        /* Buffer receiving the computed result, compared against ref. */
        Client::LocalPattern<float16_t> output;
        /* Local copies of inputs since matrix instance in CMSIS-DSP are not using
           pointers to const.
        */
        Client::LocalPattern<float16_t> a;
        Client::LocalPattern<float16_t> b;
        /* Dimensions of the matrices under test. */
        int nbr;
        int nbc;
        /* CMSIS-DSP matrix instances wrapping a, b and output. */
        arm_matrix_instance_f16 in1;
        arm_matrix_instance_f16 in2;
        arm_matrix_instance_f16 out;
};

@ -689,7 +689,12 @@ def writeUnaryTests(config,format):
# Current algo is not very accurate for big matrix.
# But big matrix required to check the vectorized code.
dims=[1,2,3,4,7,8,9,15,16,17,32,33]
if format==Tools.F16:
# Size limited for f16 because accuracy is
# not good at all for bigger matrices
dims=[1,2,3,4,7,8,9,15,16]
else:
dims=[1,2,3,4,7,8,9,15,16,17,32,33]
vals = []
inp=[]
@ -716,6 +721,7 @@ def generatePatterns():
PARAMBINDIR = os.path.join("Parameters","DSP","Matrix","Binary","Binary")
configBinaryf32=Tools.Config(PATTERNBINDIR,PARAMBINDIR,"f32")
configBinaryf16=Tools.Config(PATTERNBINDIR,PARAMBINDIR,"f16")
configBinaryq31=Tools.Config(PATTERNBINDIR,PARAMBINDIR,"q31")
configBinaryq15=Tools.Config(PATTERNBINDIR,PARAMBINDIR,"q15")
configBinaryq7=Tools.Config(PATTERNBINDIR,PARAMBINDIR,"q7")
@ -723,6 +729,7 @@ def generatePatterns():
writeBinaryTests(configBinaryf32,Tools.F32)
writeBinaryTests(configBinaryf16,Tools.F16)
writeBinaryTests(configBinaryq31,Tools.Q31)
writeBinaryTests(configBinaryq15,Tools.Q15)
writeBinaryTests(configBinaryq7,Tools.Q7)
@ -732,6 +739,7 @@ def generatePatterns():
configUnaryf64=Tools.Config(PATTERNUNDIR,PARAMUNDIR,"f64")
configUnaryf32=Tools.Config(PATTERNUNDIR,PARAMUNDIR,"f32")
configUnaryf16=Tools.Config(PATTERNUNDIR,PARAMUNDIR,"f16")
configUnaryq31=Tools.Config(PATTERNUNDIR,PARAMUNDIR,"q31")
configUnaryq15=Tools.Config(PATTERNUNDIR,PARAMUNDIR,"q15")
configUnaryq7=Tools.Config(PATTERNUNDIR,PARAMUNDIR,"q7")
@ -739,6 +747,8 @@ def generatePatterns():
writeUnaryTests(configUnaryf64,Tools.F64)
writeUnaryTests(configUnaryf32,Tools.F32)
writeUnaryTests(configUnaryf16,Tools.F16)
writeUnaryTests(configUnaryq31,Tools.Q31)
writeUnaryTests(configUnaryq31,Tools.Q31)
writeUnaryTests(configUnaryq15,Tools.Q15)
writeUnaryTests(configUnaryq7,Tools.Q7)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,22 @@
H
10
// 1
0x0001
// 2
0x0002
// 3
0x0003
// 4
0x0004
// 7
0x0007
// 8
0x0008
// 9
0x0009
// 15
0x000F
// 16
0x0010
// 2
0x0002

@ -0,0 +1,198 @@
H
98
// 1
0x0001
// 1
0x0001
// 1
0x0001
// 2
0x0002
// 1
0x0001
// 3
0x0003
// 1
0x0001
// 4
0x0004
// 1
0x0001
// 7
0x0007
// 1
0x0001
// 16
0x0010
// 1
0x0001
// 23
0x0017
// 2
0x0002
// 1
0x0001
// 2
0x0002
// 2
0x0002
// 2
0x0002
// 3
0x0003
// 2
0x0002
// 4
0x0004
// 2
0x0002
// 7
0x0007
// 2
0x0002
// 16
0x0010
// 2
0x0002
// 23
0x0017
// 3
0x0003
// 1
0x0001
// 3
0x0003
// 2
0x0002
// 3
0x0003
// 3
0x0003
// 3
0x0003
// 4
0x0004
// 3
0x0003
// 7
0x0007
// 3
0x0003
// 16
0x0010
// 3
0x0003
// 23
0x0017
// 4
0x0004
// 1
0x0001
// 4
0x0004
// 2
0x0002
// 4
0x0004
// 3
0x0003
// 4
0x0004
// 4
0x0004
// 4
0x0004
// 7
0x0007
// 4
0x0004
// 16
0x0010
// 4
0x0004
// 23
0x0017
// 7
0x0007
// 1
0x0001
// 7
0x0007
// 2
0x0002
// 7
0x0007
// 3
0x0003
// 7
0x0007
// 4
0x0004
// 7
0x0007
// 7
0x0007
// 7
0x0007
// 16
0x0010
// 7
0x0007
// 23
0x0017
// 16
0x0010
// 1
0x0001
// 16
0x0010
// 2
0x0002
// 16
0x0010
// 3
0x0003
// 16
0x0010
// 4
0x0004
// 16
0x0010
// 7
0x0007
// 16
0x0010
// 16
0x0010
// 16
0x0010
// 23
0x0017
// 23
0x0017
// 1
0x0001
// 23
0x0017
// 2
0x0002
// 23
0x0017
// 3
0x0003
// 23
0x0017
// 4
0x0004
// 23
0x0017
// 7
0x0007
// 23
0x0017
// 16
0x0010
// 23
0x0017
// 23
0x0017

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,96 @@
H
47
// 1.000000
0x3c00
// -0.013554
0xa2f1
// -0.516418
0xb822
// 0.099856
0x2e64
// -0.063094
0xac0a
// 0.235283
0x3387
// 0.058222
0x2b74
// -0.778461
0xba3a
// -0.142255
0xb08d
// -0.098023
0xae46
// 0.251210
0x3405
// -0.101114
0xae79
// 0.148685
0x30c2
// -0.064970
0xac28
// 0.080360
0x2d25
// 0.510886
0x3816
// -0.317613
0xb515
// 0.317399
0x3514
// -0.015243
0xa3ce
// -0.029123
0xa775
// 0.284566
0x348e
// -0.134889
0xb051
// -0.064918
0xac28
// 0.183938
0x31e3
// -0.253136
0xb40d
// 0.267614
0x3448
// 0.170277
0x3173
// 0.530148
0x383e
// -0.340637
0xb573
// 0.875454
0x3b01
// -0.357078
0xb5b7
// 0.489558
0x37d5
// 0.039769
0x2917
// -0.692500
0xb98a
// 0.142047
0x308c
// 0.235809
0x338c
// 0.207209
0x32a1
// 0.331737
0x354f
// -0.032288
0xa822
// -0.126846
0xb00f
// -0.718843
0xb9c0
// 0.294611
0x34b7
// 0.361941
0x35cb
// 0.066997
0x2c4a
// -0.021069
0xa565
// -0.464241
0xb76e
// 0.007340
0x1f84

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,786 @@
H
392
// 0.065177
0x2c2c
// 0.073375
0x2cb2
// 0.075378
0x2cd3
// 0.091117
0x2dd5
// -0.040496
0xa92f
// 0.032118
0x281c
// 0.110453
0x2f12
// 0.065177
0x2c2c
// -0.604829
0xb8d7
// 0.073375
0x2cb2
// -0.006015
0x9e29
// 0.075378
0x2cd3
// 0.391532
0x3644
// 0.091117
0x2dd5
// 0.129558
0x3025
// -0.040496
0xa92f
// -0.285559
0xb492
// 0.032118
0x281c
// -0.478016
0xb7a6
// 0.110453
0x2f12
// 0.171094
0x317a
// 0.065177
0x2c2c
// -0.604829
0xb8d7
// -0.003879
0x9bf2
// 0.073375
0x2cb2
// -0.006015
0x9e29
// -0.201411
0xb272
// 0.075378
0x2cd3
// 0.391532
0x3644
// -0.602685
0xb8d2
// 0.091117
0x2dd5
// 0.129558
0x3025
// 0.043250
0x2989
// -0.040496
0xa92f
// -0.285559
0xb492
// -0.214891
0xb2e0
// 0.032118
0x281c
// -0.478016
0xb7a6
// 0.914179
0x3b50
// 0.110453
0x2f12
// 0.171094
0x317a
// -0.096979
0xae35
// 0.065177
0x2c2c
// -0.604829
0xb8d7
// -0.003879
0x9bf2
// 0.157615
0x310b
// 0.073375
0x2cb2
// -0.006015
0x9e29
// -0.201411
0xb272
// -0.674383
0xb965
// 0.075378
0x2cd3
// 0.391532
0x3644
// -0.602685
0xb8d2
// 0.241720
0x33bc
// 0.091117
0x2dd5
// 0.129558
0x3025
// 0.043250
0x2989
// -0.003618
0x9b69
// -0.040496
0xa92f
// -0.285559
0xb492
// -0.214891
0xb2e0
// -0.037703
0xa8d3
// 0.032118
0x281c
// -0.478016
0xb7a6
// 0.914179
0x3b50
// -0.400779
0xb66a
// 0.110453
0x2f12
// 0.171094
0x317a
// -0.096979
0xae35
// 0.228860
0x3353
// 0.065177
0x2c2c
// -0.604829
0xb8d7
// -0.003879
0x9bf2
// 0.157615
0x310b
// -0.207477
0xb2a4
// -0.447515
0xb729
// -0.676908
0xb96a
// 0.073375
0x2cb2
// -0.006015
0x9e29
// -0.201411
0xb272
// -0.674383
0xb965
// -0.140275
0xb07d
// -0.400734
0xb669
// -0.148252
0xb0be
// 0.075378
0x2cd3
// 0.391532
0x3644
// -0.602685
0xb8d2
// 0.241720
0x33bc
// -0.013486
0xa2e8
// 0.071506
0x2c94
// -0.421687
0xb6bf
// 0.091117
0x2dd5
// 0.129558
0x3025
// 0.043250
0x2989
// -0.003618
0x9b69
// 0.032091
0x281c
// -0.026998
0xa6e9
// 0.198279
0x3258
// -0.040496
0xa92f
// -0.285559
0xb492
// -0.214891
0xb2e0
// -0.037703
0xa8d3
// -0.455534
0xb74a
// -0.394148
0xb64e
// 0.121496
0x2fc7
// 0.032118
0x281c
// -0.478016
0xb7a6
// 0.914179
0x3b50
// -0.400779
0xb66a
// 0.071360
0x2c91
// 0.421843
0x36c0
// 0.675201
0x3967
// 0.110453
0x2f12
// 0.171094
0x317a
// -0.096979
0xae35
// 0.228860
0x3353
// -0.030445
0xa7cb
// -0.052441
0xaab6
// -0.262839
0xb435
// 0.065177
0x2c2c
// -0.604829
0xb8d7
// -0.003879
0x9bf2
// 0.157615
0x310b
// -0.207477
0xb2a4
// -0.447515
0xb729
// -0.676908
0xb96a
// -0.186246
0xb1f6
// -0.138838
0xb071
// 0.105986
0x2ec8
// -0.404153
0xb677
// -0.252231
0xb409
// -0.145413
0xb0a7
// 0.209444
0x32b4
// -0.260962
0xb42d
// 0.098829
0x2e53
// 0.073375
0x2cb2
// -0.006015
0x9e29
// -0.201411
0xb272
// -0.674383
0xb965
// -0.140275
0xb07d
// -0.400734
0xb669
// -0.148252
0xb0be
// -0.262301
0xb432
// -0.163921
0xb13f
// -0.432204
0xb6ea
// -0.019905
0xa518
// 0.081151
0x2d32
// 0.453349
0x3741
// 0.588064
0x38b4
// -0.420738
0xb6bb
// 0.103660
0x2ea2
// 0.075378
0x2cd3
// 0.391532
0x3644
// -0.602685
0xb8d2
// 0.241720
0x33bc
// -0.013486
0xa2e8
// 0.071506
0x2c94
// -0.421687
0xb6bf
// -0.228767
0xb352
// 0.145997
0x30ac
// 0.676217
0x3969
// -0.097938
0xae45
// 0.477692
0x37a5
// -0.176282
0xb1a4
// -0.409477
0xb68d
// 0.034974
0x287a
// -0.103844
0xaea5
// 0.091117
0x2dd5
// 0.129558
0x3025
// 0.043250
0x2989
// -0.003618
0x9b69
// 0.032091
0x281c
// -0.026998
0xa6e9
// 0.198279
0x3258
// -0.505518
0xb80b
// 0.455219
0x3749
// -0.204145
0xb288
// 0.230497
0x3360
// -0.055522
0xab1b
// -0.576398
0xb89c
// 0.330282
0x3549
// -0.262504
0xb433
// 0.487315
0x37cc
// -0.040496
0xa92f
// -0.285559
0xb492
// -0.214891
0xb2e0
// -0.037703
0xa8d3
// -0.455534
0xb74a
// -0.394148
0xb64e
// 0.121496
0x2fc7
// 0.384751
0x3628
// -0.289029
0xb4a0
// -0.389529
0xb63c
// -0.421876
0xb6c0
// 0.290180
0x34a5
// 0.185524
0x31f0
// 0.789968
0x3a52
// -0.148520
0xb0c1
// 0.123930
0x2fee
// 0.032118
0x281c
// -0.478016
0xb7a6
// 0.914179
0x3b50
// -0.400779
0xb66a
// 0.071360
0x2c91
// 0.421843
0x36c0
// 0.675201
0x3967
// 0.011714
0x21ff
// -0.307025
0xb4ea
// 0.725770
0x39ce
// -0.346869
0xb58d
// 0.300160
0x34cd
// -0.156293
0xb100
// -0.434623
0xb6f4
// 0.765252
0x3a1f
// 0.247563
0x33ec
// 0.110453
0x2f12
// 0.171094
0x317a
// -0.096979
0xae35
// 0.228860
0x3353
// -0.030445
0xa7cb
// -0.052441
0xaab6
// -0.262839
0xb435
// 0.294711
0x34b7
// -0.363569
0xb5d1
// 0.107500
0x2ee1
// 0.506859
0x380e
// 0.054092
0x2aec
// 0.599363
0x38cb
// -0.368742
0xb5e6
// -0.781094
0xba40
// -0.065162
0xac2c
// 0.065177
0x2c2c
// -0.604829
0xb8d7
// -0.003879
0x9bf2
// 0.157615
0x310b
// -0.207477
0xb2a4
// -0.447515
0xb729
// -0.676908
0xb96a
// -0.186246
0xb1f6
// -0.138838
0xb071
// 0.105986
0x2ec8
// -0.404153
0xb677
// -0.252231
0xb409
// -0.145413
0xb0a7
// 0.209444
0x32b4
// -0.260962
0xb42d
// 0.098829
0x2e53
// -0.163146
0xb138
// 0.057192
0x2b52
// -0.436173
0xb6fb
// -0.292779
0xb4af
// -0.020365
0xa537
// -0.033979
0xa859
// 0.086233
0x2d85
// 0.073375
0x2cb2
// -0.006015
0x9e29
// -0.201411
0xb272
// -0.674383
0xb965
// -0.140275
0xb07d
// -0.400734
0xb669
// -0.148252
0xb0be
// -0.262301
0xb432
// -0.163921
0xb13f
// -0.432204
0xb6ea
// -0.019905
0xa518
// 0.081151
0x2d32
// 0.453349
0x3741
// 0.588064
0x38b4
// -0.420738
0xb6bb
// 0.103660
0x2ea2
// 0.385484
0x362b
// -0.174786
0xb198
// -0.116991
0xaf7d
// 0.118594
0x2f97
// 0.258199
0x3422
// -0.082567
0xad49
// -0.228460
0xb350
// 0.075378
0x2cd3
// 0.391532
0x3644
// -0.602685
0xb8d2
// 0.241720
0x33bc
// -0.013486
0xa2e8
// 0.071506
0x2c94
// -0.421687
0xb6bf
// -0.228767
0xb352
// 0.145997
0x30ac
// 0.676217
0x3969
// -0.097938
0xae45
// 0.477692
0x37a5
// -0.176282
0xb1a4
// -0.409477
0xb68d
// 0.034974
0x287a
// -0.103844
0xaea5
// -0.499133
0xb7fc
// -0.829460
0xbaa3
// -0.182757
0xb1d9
// -0.506883
0xb80e
// 0.522298
0x382e
// -0.522191
0xb82d
// 0.065033
0x2c29
// 0.091117
0x2dd5
// 0.129558
0x3025
// 0.043250
0x2989
// -0.003618
0x9b69
// 0.032091
0x281c
// -0.026998
0xa6e9
// 0.198279
0x3258
// -0.505518
0xb80b
// 0.455219
0x3749
// -0.204145
0xb288
// 0.230497
0x3360
// -0.055522
0xab1b
// -0.576398
0xb89c
// 0.330282
0x3549
// -0.262504
0xb433
// 0.487315
0x37cc
// -0.240474
0xb3b2
// 0.345253
0x3586
// 0.156623
0x3103
// 0.317572
0x3515
// 0.295017
0x34b8
// 0.130245
0x302b
// -0.217621
0xb2f7
// -0.040496
0xa92f
// -0.285559
0xb492
// -0.214891
0xb2e0
// -0.037703
0xa8d3
// -0.455534
0xb74a
// -0.394148
0xb64e
// 0.121496
0x2fc7
// 0.384751
0x3628
// -0.289029
0xb4a0
// -0.389529
0xb63c
// -0.421876
0xb6c0
// 0.290180
0x34a5
// 0.185524
0x31f0
// 0.789968
0x3a52
// -0.148520
0xb0c1
// 0.123930
0x2fee
// 0.168120
0x3161
// -0.101402
0xae7d
// -0.358987
0xb5be
// -0.749984
0xba00
// -0.323443
0xb52d
// 0.084039
0x2d61
// -0.622083
0xb8fa
// 0.032118
0x281c
// -0.478016
0xb7a6
// 0.914179
0x3b50
// -0.400779
0xb66a
// 0.071360
0x2c91
// 0.421843
0x36c0
// 0.675201
0x3967
// 0.011714
0x21ff
// -0.307025
0xb4ea
// 0.725770
0x39ce
// -0.346869
0xb58d
// 0.300160
0x34cd
// -0.156293
0xb100
// -0.434623
0xb6f4
// 0.765252
0x3a1f
// 0.247563
0x33ec
// -0.072272
0xaca0
// 0.362537
0x35cd
// -0.229903
0xb35b
// 0.161554
0x312b
// 0.311479
0x34fc
// -0.004970
0x9d17
// 0.198138
0x3257
// 0.110453
0x2f12
// 0.171094
0x317a
// -0.096979
0xae35
// 0.228860
0x3353
// -0.030445
0xa7cb
// -0.052441
0xaab6
// -0.262839
0xb435
// 0.294711
0x34b7
// -0.363569
0xb5d1
// 0.107500
0x2ee1
// 0.506859
0x380e
// 0.054092
0x2aec
// 0.599363
0x38cb
// -0.368742
0xb5e6
// -0.781094
0xba40
// -0.065162
0xac2c
// 0.031448
0x2806
// 0.088655
0x2dad
// -0.035384
0xa887
// -1.267741
0xbd12
// 0.424117
0x36c9
// 0.327717
0x353e
// 0.660219
0x3948

@ -0,0 +1,152 @@
#include "BinaryTestsF16.h"
#include <stdio.h>
#include "Error.h"
#define SNR_THRESHOLD 60
/*
Reference patterns are generated with
a double precision computation.
*/
#define REL_ERROR (2.0e-3)
#define ABS_ERROR (2.0e-3)
/* Upper bound of maximum matrix dimension used by Python */
#define MAXMATRIXDIM 40
/*
 * Declares and initializes the locals shared by the binary matrix tests:
 *  - inp1/inp2  : read pointers into the two input patterns
 *  - ap/bp      : scratch buffers that receive a copy of each operand
 *  - outp       : write pointer into the output pattern
 *  - dimsp      : pointer into the dimension pattern, consumed as
 *                 (rows, internal, columns) triplets
 *  - nbMatrixes : number of test matrices (one triplet each)
 */
#define LOADDATA2() \
const float16_t *inp1=input1.ptr(); \
const float16_t *inp2=input2.ptr(); \
\
float16_t *ap=a.ptr(); \
float16_t *bp=b.ptr(); \
\
float16_t *outp=output.ptr(); \
int16_t *dimsp = dims.ptr(); \
int nbMatrixes = dims.nbSamples() / 3;\
int rows,internal,columns; \
int i;
/*
 * Fills in1 (rows x internal), in2 (internal x columns) and out
 * (rows x columns) for the current iteration, copying the operands into
 * the ap/bp scratch buffers.
 * The memcpy size carries a 2x factor so that complex matrices
 * (two float16_t per element, see test_mat_cmplx_mult_f16) are copied
 * in full.
 * NOTE(review): the same macro is used by the real-valued multiply test,
 * where the 2x factor copies twice the matrix payload — confirm the
 * input patterns and the a/b buffers are sized for this worst case.
 */
#define PREPAREDATA2() \
in1.numRows=rows; \
in1.numCols=internal; \
memcpy((void*)ap,(const void*)inp1,2*sizeof(float16_t)*rows*internal);\
in1.pData = ap; \
\
in2.numRows=internal; \
in2.numCols=columns; \
memcpy((void*)bp,(const void*)inp2,2*sizeof(float16_t)*internal*columns);\
in2.pData = bp; \
\
out.numRows=rows; \
out.numCols=columns; \
out.pData = outp;
/* Real f16 matrix multiply: run every (rows x internal) * (internal x columns)
   case listed in the dims pattern and compare the result to the reference. */
void BinaryTestsF16::test_mat_mult_f16()
{
    LOADDATA2();

    i = 0;
    while (i < nbMatrixes)
    {
        rows     = *dimsp++;
        internal = *dimsp++;
        columns  = *dimsp++;

        PREPAREDATA2();

        arm_mat_mult_f16(&this->in1, &this->in2, &this->out);

        outp += rows * columns;
        i++;
    }

    /* Nothing may be written past the last expected output sample. */
    ASSERT_EMPTY_TAIL(output);

    ASSERT_SNR(output, ref, (float16_t)SNR_THRESHOLD);
    ASSERT_CLOSE_ERROR(output, ref, ABS_ERROR, REL_ERROR);
}
/* Complex f16 matrix multiply: same dimension triplets as the real test,
   but every element occupies two float16_t (real, imag). */
void BinaryTestsF16::test_mat_cmplx_mult_f16()
{
    LOADDATA2();

    for (i = 0; i < nbMatrixes; ++i)
    {
        rows     = *dimsp++;
        internal = *dimsp++;
        columns  = *dimsp++;

        PREPAREDATA2();

        arm_mat_cmplx_mult_f16(&this->in1, &this->in2, &this->out);

        /* Two float16_t per complex output element. */
        outp += 2 * rows * columns;
    }

    ASSERT_EMPTY_TAIL(output);

    ASSERT_SNR(output, ref, (float16_t)SNR_THRESHOLD);
    ASSERT_CLOSE_ERROR(output, ref, ABS_ERROR, REL_ERROR);
}
/**
 * Loads the input/reference patterns and allocates the scratch/output
 * buffers required by the test selected by \p id.
 *
 * Fix: the a/b scratch buffers for the real multiply test are now sized
 * with a 2x factor. PREPAREDATA2() always copies
 * 2*sizeof(float16_t)*rows*cols bytes (the factor needed by the complex
 * test), so MAXMATRIXDIM*MAXMATRIXDIM-element buffers could overflow for
 * the largest matrices.
 */
void BinaryTestsF16::setUp(Testing::testID_t id,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr)
{
    (void)params;

    switch(id)
    {
       case TEST_MAT_MULT_F16_1:
          input1.reload(BinaryTestsF16::INPUTS1_F16_ID,mgr);
          input2.reload(BinaryTestsF16::INPUTS2_F16_ID,mgr);
          dims.reload(BinaryTestsF16::DIMSBINARY1_S16_ID,mgr);

          ref.reload(BinaryTestsF16::REFMUL1_F16_ID,mgr);

          output.create(ref.nbSamples(),BinaryTestsF16::OUT_F16_ID,mgr);
          /* 2x head-room: PREPAREDATA2 copies 2*sizeof(float16_t) per element. */
          a.create(2*MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsF16::TMPA_F16_ID,mgr);
          b.create(2*MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsF16::TMPB_F16_ID,mgr);
          break;

       case TEST_MAT_CMPLX_MULT_F16_2:
          input1.reload(BinaryTestsF16::INPUTSC1_F16_ID,mgr);
          input2.reload(BinaryTestsF16::INPUTSC2_F16_ID,mgr);
          dims.reload(BinaryTestsF16::DIMSBINARY1_S16_ID,mgr);

          ref.reload(BinaryTestsF16::REFCMPLXMUL1_F16_ID,mgr);

          output.create(ref.nbSamples(),BinaryTestsF16::OUT_F16_ID,mgr);
          /* Complex data: two float16_t per element. */
          a.create(2*MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsF16::TMPA_F16_ID,mgr);
          b.create(2*MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsF16::TMPB_F16_ID,mgr);
          break;

       default:
          /* Unknown test id: nothing to load. */
          break;
    }
}
/**
 * Called after each test: dumps the computed output pattern through the
 * pattern manager so the framework can archive/compare it.
 */
void BinaryTestsF16::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)
{
(void)id;
output.dump(mgr);
}

@ -0,0 +1,408 @@
#include "UnaryTestsF16.h"
#include <stdio.h>
#include "Error.h"
#define SNR_THRESHOLD 60
/*
Reference patterns are generated with
a double precision computation.
*/
#define REL_ERROR (1.1e-3)
#define ABS_ERROR (1.1e-3)
/*
Comparisons for inverse
*/
/* Not very accurate for big matrices,
but big matrices are needed to check the vectorized code. */
#define SNR_THRESHOLD_INV 45
#define REL_ERROR_INV (3.0e-2)
#define ABS_ERROR_INV (3.0e-2)
/* Upper bound of maximum matrix dimension used by Python */
#define MAXMATRIXDIM 40
/*
 * Declares and initializes the locals shared by the two-operand unary
 * tests (add, sub): input read pointers, scratch buffers, output write
 * pointer and the dimension stream consumed as (rows, columns) pairs —
 * hence nbMatrixes = nbSamples() >> 1.
 */
#define LOADDATA2() \
const float16_t *inp1=input1.ptr(); \
const float16_t *inp2=input2.ptr(); \
\
float16_t *ap=a.ptr(); \
float16_t *bp=b.ptr(); \
\
float16_t *outp=output.ptr(); \
int16_t *dimsp = dims.ptr(); \
int nbMatrixes = dims.nbSamples() >> 1;\
int rows,columns; \
int i;
/*
 * Single-operand variant of LOADDATA2 (scale, transpose tests): one
 * input pattern, one scratch buffer, and (rows, columns) pairs read
 * from the dimension pattern.
 */
#define LOADDATA1() \
const float16_t *inp1=input1.ptr(); \
\
float16_t *ap=a.ptr(); \
\
float16_t *outp=output.ptr(); \
int16_t *dimsp = dims.ptr(); \
int nbMatrixes = dims.nbSamples() >> 1;\
int rows,columns; \
int i;
/*
 * Fills in1, in2 and out for an element-wise binary operation: all three
 * matrices share the same (rows x columns) shape, and the real-valued
 * operands are copied into the ap/bp scratch buffers.
 */
#define PREPAREDATA2() \
in1.numRows=rows; \
in1.numCols=columns; \
memcpy((void*)ap,(const void*)inp1,sizeof(float16_t)*rows*columns);\
in1.pData = ap; \
\
in2.numRows=rows; \
in2.numCols=columns; \
memcpy((void*)bp,(const void*)inp2,sizeof(float16_t)*rows*columns);\
in2.pData = bp; \
\
out.numRows=rows; \
out.numCols=columns; \
out.pData = outp;
/*
 * Fills in1 and out for a single-operand (real-valued) operation.
 * When TRANSPOSED is true the output dimensions are swapped
 * (columns x rows), as required by the transpose test.
 */
#define PREPAREDATA1(TRANSPOSED) \
in1.numRows=rows; \
in1.numCols=columns; \
memcpy((void*)ap,(const void*)inp1,sizeof(float16_t)*rows*columns);\
in1.pData = ap; \
\
if (TRANSPOSED) \
{ \
out.numRows=columns; \
out.numCols=rows; \
} \
else \
{ \
out.numRows=rows; \
out.numCols=columns; \
} \
out.pData = outp;
/*
 * Complex variant of PREPAREDATA1: each element is two float16_t
 * (real, imag), hence the 2x factor in the copy size.
 * NOTE(review): the ap scratch buffer must hold 2*rows*columns
 * float16_t — confirm it is allocated with the 2x factor in setUp.
 */
#define PREPAREDATA1C(TRANSPOSED) \
in1.numRows=rows; \
in1.numCols=columns; \
memcpy((void*)ap,(const void*)inp1,2*sizeof(float16_t)*rows*columns);\
in1.pData = ap; \
\
if (TRANSPOSED) \
{ \
out.numRows=columns; \
out.numCols=rows; \
} \
else \
{ \
out.numRows=rows; \
out.numCols=columns; \
} \
out.pData = outp;
/*
 * Locals for the matrix x vector test: matrix pattern (inp1), vector
 * pattern (inp2), scratch buffers, output pointer, and the dimension
 * stream consumed as (rows, internal) pairs — hence
 * nbMatrixes = nbSamples() / 2.
 */
#define LOADVECDATA2() \
const float16_t *inp1=input1.ptr(); \
const float16_t *inp2=input2.ptr(); \
\
float16_t *ap=a.ptr(); \
float16_t *bp=b.ptr(); \
\
float16_t *outp=output.ptr(); \
int16_t *dimsp = dims.ptr(); \
int nbMatrixes = dims.nbSamples() / 2;\
int rows,internal; \
int i;
/*
 * Prepares the matrix operand in1 (rows x internal) and copies the
 * input vector (internal elements) into bp for the matrix x vector test.
 *
 * Fix: both copies used a 2x factor (2*sizeof(float16_t)*...) that is
 * only meaningful for complex data. The mat-vec patterns
 * (INPUTS1_F16_ID / INPUTVEC1_F16_ID) are real-valued, so the 2x factor
 * over-read the source patterns and could overflow bp, which is created
 * with only MAXMATRIXDIM float16_t elements.
 */
#define PREPAREVECDATA2() \
in1.numRows=rows; \
in1.numCols=internal; \
memcpy((void*)ap,(const void*)inp1,sizeof(float16_t)*rows*internal);\
in1.pData = ap; \
\
memcpy((void*)bp,(const void*)inp2,sizeof(float16_t)*internal);
/* Matrix x vector multiply: for each (rows x internal) matrix in the dims
   pattern, multiply by the input vector and append the rows-element
   result to the output. */
void UnaryTestsF16::test_mat_vec_mult_f16()
{
    LOADVECDATA2();

    i = 0;
    while (i < nbMatrixes)
    {
        rows     = *dimsp++;
        internal = *dimsp++;

        PREPAREVECDATA2();

        arm_mat_vec_mult_f16(&this->in1, bp, outp);

        /* One output sample per matrix row. */
        outp += rows;
        i++;
    }

    ASSERT_EMPTY_TAIL(output);

    ASSERT_SNR(output, ref, (float16_t)SNR_THRESHOLD);
    ASSERT_CLOSE_ERROR(output, ref, ABS_ERROR, REL_ERROR);
}
/* Element-wise matrix addition for every (rows x columns) case in the
   dims pattern; results are compared against the reference output. */
void UnaryTestsF16::test_mat_add_f16()
{
    LOADDATA2();

    for (i = 0; i < nbMatrixes; ++i)
    {
        rows    = *dimsp++;
        columns = *dimsp++;

        PREPAREDATA2();

        arm_mat_add_f16(&this->in1, &this->in2, &this->out);

        outp += rows * columns;
    }

    ASSERT_EMPTY_TAIL(output);

    ASSERT_SNR(output, ref, (float16_t)SNR_THRESHOLD);
    ASSERT_CLOSE_ERROR(output, ref, ABS_ERROR, REL_ERROR);
}
/* Element-wise matrix subtraction for every (rows x columns) case in the
   dims pattern; results are compared against the reference output. */
void UnaryTestsF16::test_mat_sub_f16()
{
    LOADDATA2();

    i = 0;
    while (i < nbMatrixes)
    {
        rows    = *dimsp++;
        columns = *dimsp++;

        PREPAREDATA2();

        arm_mat_sub_f16(&this->in1, &this->in2, &this->out);

        outp += rows * columns;
        i++;
    }

    ASSERT_EMPTY_TAIL(output);

    ASSERT_SNR(output, ref, (float16_t)SNR_THRESHOLD);
    ASSERT_CLOSE_ERROR(output, ref, ABS_ERROR, REL_ERROR);
}
/* Matrix scaling: multiply every matrix of the dims pattern by the
   fixed factor 0.5f and compare against the reference. */
void UnaryTestsF16::test_mat_scale_f16()
{
    LOADDATA1();

    for (i = 0; i < nbMatrixes; ++i)
    {
        rows    = *dimsp++;
        columns = *dimsp++;

        /* Output has the same shape as the input (no transpose). */
        PREPAREDATA1(false);

        arm_mat_scale_f16(&this->in1, 0.5f, &this->out);

        outp += rows * columns;
    }

    ASSERT_EMPTY_TAIL(output);

    ASSERT_SNR(output, ref, (float16_t)SNR_THRESHOLD);
    ASSERT_CLOSE_ERROR(output, ref, ABS_ERROR, REL_ERROR);
}
/* Real matrix transpose: output dimensions are swapped relative to the
   input (PREPAREDATA1(true)); compare against the reference. */
void UnaryTestsF16::test_mat_trans_f16()
{
    LOADDATA1();

    i = 0;
    while (i < nbMatrixes)
    {
        rows    = *dimsp++;
        columns = *dimsp++;

        PREPAREDATA1(true);

        arm_mat_trans_f16(&this->in1, &this->out);

        outp += rows * columns;
        i++;
    }

    ASSERT_EMPTY_TAIL(output);

    ASSERT_SNR(output, ref, (float16_t)SNR_THRESHOLD);
    ASSERT_CLOSE_ERROR(output, ref, ABS_ERROR, REL_ERROR);
}
/* Complex matrix transpose: two float16_t per element, so the output
   pointer advances by 2 * rows * columns per matrix. */
void UnaryTestsF16::test_mat_cmplx_trans_f16()
{
    LOADDATA1();

    for (i = 0; i < nbMatrixes; ++i)
    {
        rows    = *dimsp++;
        columns = *dimsp++;

        PREPAREDATA1C(true);

        arm_mat_cmplx_trans_f16(&this->in1, &this->out);

        outp += 2 * (rows * columns);
    }

    ASSERT_EMPTY_TAIL(output);

    ASSERT_SNR(output, ref, (float16_t)SNR_THRESHOLD);
    ASSERT_CLOSE_ERROR(output, ref, ABS_ERROR, REL_ERROR);
}
void UnaryTestsF16::test_mat_inverse_f16()
{
const float16_t *inp1=input1.ptr();
float16_t *ap=a.ptr();
float16_t *outp=output.ptr();
int16_t *dimsp = dims.ptr();
int nbMatrixes = dims.nbSamples();
int rows,columns;
int i;
arm_status status;
for(i=0;i < nbMatrixes ; i ++)
{
rows = *dimsp++;
columns = rows;
PREPAREDATA1(false);
status=arm_mat_inverse_f16(&this->in1,&this->out);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
inp1 += (rows * columns);
}
ASSERT_EMPTY_TAIL(output);
ASSERT_SNR(output,ref,(float16_t)SNR_THRESHOLD_INV);
ASSERT_CLOSE_ERROR(output,ref,ABS_ERROR_INV,REL_ERROR_INV);
}
/**
 * Loads the input/reference patterns and allocates the scratch/output
 * buffers required by the test selected by \p id.
 *
 * Fixes:
 *  - TEST_MAT_CMPLX_TRANS_F16_7: a is now 2*MAXMATRIXDIM*MAXMATRIXDIM
 *    elements, because PREPAREDATA1C copies 2*sizeof(float16_t) per
 *    element (real + imag) and would otherwise overflow the buffer for
 *    the largest matrices.
 *  - TEST_MAT_VEC_MULT_F16_6: a and b are oversized by 2x so that even
 *    a copy macro using a complex-style 2x factor cannot overflow them.
 */
void UnaryTestsF16::setUp(Testing::testID_t id,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr)
{
    (void)params;

    switch(id)
    {
       case TEST_MAT_ADD_F16_1:
          input1.reload(UnaryTestsF16::INPUTS1_F16_ID,mgr);
          input2.reload(UnaryTestsF16::INPUTS2_F16_ID,mgr);
          dims.reload(UnaryTestsF16::DIMSUNARY1_S16_ID,mgr);

          ref.reload(UnaryTestsF16::REFADD1_F16_ID,mgr);

          output.create(ref.nbSamples(),UnaryTestsF16::OUT_F16_ID,mgr);
          a.create(MAXMATRIXDIM*MAXMATRIXDIM,UnaryTestsF16::TMPA_F16_ID,mgr);
          b.create(MAXMATRIXDIM*MAXMATRIXDIM,UnaryTestsF16::TMPB_F16_ID,mgr);
          break;

       case TEST_MAT_SUB_F16_2:
          input1.reload(UnaryTestsF16::INPUTS1_F16_ID,mgr);
          input2.reload(UnaryTestsF16::INPUTS2_F16_ID,mgr);
          dims.reload(UnaryTestsF16::DIMSUNARY1_S16_ID,mgr);

          ref.reload(UnaryTestsF16::REFSUB1_F16_ID,mgr);

          output.create(ref.nbSamples(),UnaryTestsF16::OUT_F16_ID,mgr);
          a.create(MAXMATRIXDIM*MAXMATRIXDIM,UnaryTestsF16::TMPA_F16_ID,mgr);
          b.create(MAXMATRIXDIM*MAXMATRIXDIM,UnaryTestsF16::TMPB_F16_ID,mgr);
          break;

       case TEST_MAT_SCALE_F16_3:
          input1.reload(UnaryTestsF16::INPUTS1_F16_ID,mgr);
          dims.reload(UnaryTestsF16::DIMSUNARY1_S16_ID,mgr);

          ref.reload(UnaryTestsF16::REFSCALE1_F16_ID,mgr);

          output.create(ref.nbSamples(),UnaryTestsF16::OUT_F16_ID,mgr);
          a.create(MAXMATRIXDIM*MAXMATRIXDIM,UnaryTestsF16::TMPA_F16_ID,mgr);
          break;

       case TEST_MAT_TRANS_F16_4:
          input1.reload(UnaryTestsF16::INPUTS1_F16_ID,mgr);
          dims.reload(UnaryTestsF16::DIMSUNARY1_S16_ID,mgr);

          ref.reload(UnaryTestsF16::REFTRANS1_F16_ID,mgr);

          output.create(ref.nbSamples(),UnaryTestsF16::OUT_F16_ID,mgr);
          a.create(MAXMATRIXDIM*MAXMATRIXDIM,UnaryTestsF16::TMPA_F16_ID,mgr);
          break;

       case TEST_MAT_INVERSE_F16_5:
          input1.reload(UnaryTestsF16::INPUTSINV_F16_ID,mgr);
          dims.reload(UnaryTestsF16::DIMSINVERT1_S16_ID,mgr);

          ref.reload(UnaryTestsF16::REFINV1_F16_ID,mgr);

          output.create(ref.nbSamples(),UnaryTestsF16::OUT_F16_ID,mgr);
          a.create(MAXMATRIXDIM*MAXMATRIXDIM,UnaryTestsF16::TMPA_F16_ID,mgr);
          break;

       case TEST_MAT_VEC_MULT_F16_6:
          input1.reload(UnaryTestsF16::INPUTS1_F16_ID,mgr);
          input2.reload(UnaryTestsF16::INPUTVEC1_F16_ID,mgr);
          dims.reload(UnaryTestsF16::DIMSUNARY1_S16_ID,mgr);

          ref.reload(UnaryTestsF16::REFVECMUL1_F16_ID,mgr);

          output.create(ref.nbSamples(),UnaryTestsF16::OUT_F16_ID,mgr);
          /* 2x head-room against a complex-style 2x copy factor. */
          a.create(2*MAXMATRIXDIM*MAXMATRIXDIM,UnaryTestsF16::TMPA_F16_ID,mgr);
          b.create(2*MAXMATRIXDIM,UnaryTestsF16::TMPB_F16_ID,mgr);
          break;

       case TEST_MAT_CMPLX_TRANS_F16_7:
          input1.reload(UnaryTestsF16::INPUTSC1_F16_ID,mgr);
          dims.reload(UnaryTestsF16::DIMSUNARY1_S16_ID,mgr);

          ref.reload(UnaryTestsF16::REFTRANSC1_F16_ID,mgr);

          output.create(ref.nbSamples(),UnaryTestsF16::OUT_F16_ID,mgr);
          /* Complex data: PREPAREDATA1C copies two float16_t per element. */
          a.create(2*MAXMATRIXDIM*MAXMATRIXDIM,UnaryTestsF16::TMPA_F16_ID,mgr);
          break;

       default:
          /* Unknown test id: nothing to load. */
          break;
    }
}
/**
 * Called after each test: dumps the computed output pattern through the
 * pattern manager so the framework can archive/compare it.
 */
void UnaryTestsF16::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)
{
(void)id;
output.dump(mgr);
}

@ -344,6 +344,88 @@ group Root {
}
group Matrix Tests {
class = MatrixTests
folder = Matrix
group Unary Tests {
class = UnaryTests
folder = Unary
suite Unary Tests F16 {
class = UnaryTestsF16
folder = UnaryF16
Pattern INPUTS1_F16_ID : InputA1_f16.txt
Pattern INPUTSC1_F16_ID : InputAC1_f16.txt
Pattern INPUTS2_F16_ID : InputB1_f16.txt
Pattern INPUTVEC1_F16_ID : InputVec1_f16.txt
Pattern INPUTSINV_F16_ID : InputInvert1_f16.txt
Pattern DIMSUNARY1_S16_ID : DimsUnary1_s16.txt
Pattern DIMSINVERT1_S16_ID : DimsInvert1_s16.txt
Pattern REFADD1_F16_ID : RefAdd1_f16.txt
Pattern REFSUB1_F16_ID : RefSub1_f16.txt
Pattern REFSCALE1_F16_ID : RefScale1_f16.txt
Pattern REFTRANS1_F16_ID : RefTranspose1_f16.txt
Pattern REFTRANSC1_F16_ID : RefTransposeC1_f16.txt
Pattern REFINV1_F16_ID : RefInvert1_f16.txt
Pattern REFVECMUL1_F16_ID : RefVecMul1_f16.txt
Output OUT_F16_ID : Output
Output TMPA_F16_ID : TmpA
Output TMPB_F16_ID : TmpB
Functions {
test matrix add:test_mat_add_f16
test matrix sub:test_mat_sub_f16
test matrix scale:test_mat_scale_f16
test matrix transpose:test_mat_trans_f16
test matrix inverse:test_mat_inverse_f16
test mat mult vec:test_mat_vec_mult_f16
test matrix complex transpose:test_mat_cmplx_trans_f16
}
}
}
group Binary Tests {
class = BinaryTests
folder = Binary
suite Binary F16 {
class = BinaryTestsF16
folder = BinaryF16
Pattern INPUTS1_F16_ID : InputA1_f16.txt
Pattern INPUTS2_F16_ID : InputB1_f16.txt
Pattern INPUTSC1_F16_ID : InputAC1_f16.txt
Pattern INPUTSC2_F16_ID : InputBC1_f16.txt
Pattern REFMUL1_F16_ID : RefMul1_f16.txt
Pattern REFCMPLXMUL1_F16_ID : RefCmplxMul1_f16.txt
Pattern DIMSBINARY1_S16_ID : DimsBinary1_s16.txt
Output OUT_F16_ID : Output
Output TMPA_F16_ID : TmpA
Output TMPB_F16_ID : TmpB
Functions {
test mult:test_mat_mult_f16
test complex mult:test_mat_cmplx_mult_f16
}
}
}
}
group Transform Tests {
class = TransformTests
folder = Transform

Loading…
Cancel
Save