/*
   A utility function to override Intel MKL (Math Kernel Library)'s internal
   processor dispatch settings/parameters at run-time.

   This has been tested with (64-bit) version 10.2.2, 10.2.5, 10.3.

   To use, call MKLParamOverride() in your program.  Then at run-time set the
   environmental variables MKL_xxxx, where xxxx is the name of an MKL internal
   parameter and can be:

        disable_fast_mm
        __MKL_CPU_MicroArchitecture
        itisBarcelona
        mkl_cpu_type
        __HT
        __N_Logical_Cores
        __N_Physical_Cores
        __N_CPU_Packages
        __N_Cores_per_Packages
        Levels_of_cache
        L1_cache_size
        L2_cache_size
        L3_cache_size

   By default, MKLParamOverride() will print the default value of these
   internal parameters.

   My experiments show that only mkl_cpu_type, __N_Physical_Cores, and
   L1_cache_size can affect the run-time behavior of MKL (in particular, BLAS
   function DGEMM on large problem sizes.)

   The meanings of these internal parameters are:

   disable_fast_mm:
        This indicates whether the fast memory management is on or not.  One
        should use the environmental variable MKL_DISABLE_FAST_MM instead.
        The value of this parameter is set only when MKL allocates certain
        sizes of memory chunks in certain BLAS functions (e.g. dgemm)

   __MKL_CPU_MicroArchitecture:
         32: Merom
         33: Penryn
         64: Nehalem
         66: Westmere
        128: Sandy Bridge
          0: Everything else (including AMD Barcelona)
        The (undocumented) environmental variable MKL_DEBUG_CPU_MA can
        "somewhat" set the value of this internal parameter.  "Somewhat" means
        MKL_DEBUG_CPU_MA is used only when the "mkl_cpu_type" internal
        parameter (see below) is set for SSE 4.2 instruction set.

   itisBarcelona:
        When __MKL_CPU_MicroArchitecture is 0, set this parameter to 1 to
        indicate the CPU microarchitecture is AMD Barcelona (K10)

   mkl_cpu_type:
        SSE instruction set level.
        For 64-bit:
          0: Default (SSE)
          1: SSE 2
          2: Supplemental SSE 3 (SSSE3)
          3: SSE 4.2
          4: AVX
        For 32-bit:
          0: Default (SSE)
          1: SSE 2
          2: SSE 3
          3: Supplemental SSE 3 (SSSE3)
          4: SSE 4.2
          5: AVX
        The (undocumented) environmental variable MKL_DEBUG_CPU_TYPE can also
        be used to set the value of this parameter.
        >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
        NOTE: This parameter has the biggest impact on the performance.
        >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

   __HT:
        Intel Hyper-Threading technology is present.

   __N_Logical_Cores:
   __N_Physical_Cores:
   __N_CPU_Packages:
   __N_Cores_per_Packages:
        Processor/core topology.
        NOTE: Setting the value of __N_Physical_Cores has the same effect as
        setting the OpenMP environmental variable OMP_NUM_THREADS or MKL's
        environmental variable MKL_NUM_THREADS

   Levels_of_cache:
   L1_cache_size:
   L2_cache_size:
   L3_cache_size:
        Cache hierarchy and cache sizes.

   **************************************************************************

   How does MKLParamOverride() obtain the addresses of MKL's internal
   parameters ?

   Since these internal parameters are defined at file/function scope (i.e.
   with "static" keyword), they are invisible to the outside world.  Unlike
   Java, C does not have the "reflection" API, so the only way to know their
   addresses is through the "nm" utility.  If your program is linked
   statically (and unstripped), then "nm" can tell you the addresses.  If
   dynamic linking is in use, then we need to look at the run-time process
   memory map (/proc/self/maps) and determine the Base address (of the MKL
   core library, which defines these internal parameters), use "nm" to get
   their addresses as before, and adjust the addresses by the Base address.
*/

#define _GNU_SOURCE
/* NOTE(review): the header names were missing from the include directives in
   the reviewed source; this list is reconstructed from the library functions
   the code below actually calls -- confirm against the original file. */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* MKL service routines called below to make MKL fill in its internal
   parameters.  They are declared here exactly the way an implicit
   declaration would have declared them (returning int, parameters
   unspecified), so the calls keep compiling on compilers that reject
   implicit declarations.
   NOTE(review): the real signatures are not visible here -- confirm. */
extern int mkl_serv_get_ht();
extern int mkl_serv_get_ncpus();
extern int mkl_serv_get_ncorespercpu();
extern int mkl_serv_mkl_domain_get_max_threads();
extern int mkl_serv_cpu_detect();
extern int mkl_serv_mkl_enable_instructions();
extern int mkl_serv_get_microarchitecture();
extern int mkl_serv_cpuisitbarcelona();
extern int mkl_serv_get_cache_size();

#define MKL_ARRAY_LEN( a ) ( sizeof( a ) / sizeof( ( a )[0] ) )

enum {
    MKL_SYMBOL_BUF_SIZE = 1024,  /* must stay in sync with "%1023s" below */
    MKL_PATH_BUF_SIZE   = 1024,
    MKL_QUERY_BUF_SIZE  = 256,
    MKL_ENV_NAME_SIZE   = 64
};

/* Life cycle of the address lookup performed by MKLParamOverride(). */
enum {
    MKL_ADDR_UNRESOLVED = 0,  /* lookup not attempted yet            */
    MKL_ADDR_RESOLVED   = 1,  /* addresses are known and usable      */
    MKL_ADDR_FAILED     = 2   /* lookup failed; do not try again     */
};

/*
   Read "nm"-style lines ("<hex address> <type char> <symbol>") from f and,
   for every line whose symbol equals paramName[i], store the address in
   paramAddress[i].

   paramName     array of siz symbol names to look for
   paramAddress  array of siz slots; only slots of matched names are written
   siz           number of entries in both arrays (must be > 0)
   f             stream to read; NULL is treated as "no input".  The stream
                 is NOT closed here: the caller opened it and knows whether
                 fclose() or pclose() is the right call.

   Returns the number of matching lines (a symbol listed twice counts twice).
*/
static int MKLFindParamAddress( const char **paramName, int **paramAddress, int siz, FILE *f )
{
    char symbol[MKL_SYMBOL_BUF_SIZE];
    char *line = NULL;
    size_t lineCap = 0;
    int matches = 0;

    assert( NULL != paramName );
    assert( NULL != paramAddress );
    assert( 0 < siz );
    /* addresses are parsed with %lx and then converted to pointers */
    assert( sizeof( void* ) == sizeof( unsigned long int ) );

    if ( NULL == f ) {
        return 0;
    }

    while ( 0 < getline( &line, &lineCap, f ) ) {
        unsigned long int address;
        char type;
        int i;

        /* lines without an address (e.g. undefined symbols) do not parse */
        if ( 3 != sscanf( line, "%lx %c %1023s\n", &address, &type, symbol ) ) {
            continue;
        }
        for ( i = 0; i < siz; ++i ) {
            if ( 0 == strcmp( symbol, paramName[i] ) ) {
                paramAddress[i] = ( int* )address;
                ++matches;
                break;
            }
        }
    }
    free( line );  /* getline() reuses one buffer for all lines */

    return matches;
}

/*
   Run "nm -n <objectPath> | <egrepQuery>" and collect the addresses of the
   requested symbols from its output.

   Returns the number of matches, or 0 when the command could not be built
   or started.

   NOTE(review): objectPath is interpolated into a shell command unquoted,
   as in the original code, so a path containing whitespace or shell
   metacharacters will not work.
*/
static int MKLScanSymbols( const char *objectPath, const char *egrepQuery,
                           const char **paramName, int **paramAddress, int siz )
{
    char *command = NULL;
    FILE *pipe;
    int matches;

    if ( 0 > asprintf( &command, "nm -n %s | %s", objectPath, egrepQuery ) ) {
        return 0;
    }
    pipe = popen( command, "r" );
    free( command );
    if ( NULL == pipe ) {
        return 0;
    }

    matches = MKLFindParamAddress( paramName, paramAddress, siz, pipe );
    pclose( pipe );  /* streams from popen() must be closed with pclose() */

    return matches;
}

/*
   Find the run-time addresses of the symbols in paramName and store them in
   paramAddress.  The executable itself is examined first (static linking);
   if that does not yield enough symbols, the MKL core library listed in the
   process memory map is examined and the addresses are shifted by the
   library's start address.

   caller is the function name used as prefix of the diagnostics.

   Returns MKL_ADDR_RESOLVED on success and MKL_ADDR_FAILED otherwise.
*/
static int MKLLocateParams( const char *caller, const char **paramName, int **paramAddress, int paramNum )
{
    static const char mapsFilter[] = "fgrep '/libmkl_core.so'";
    char egrepQuery[MKL_QUERY_BUF_SIZE];
    char exePath[MKL_PATH_BUF_SIZE];
    char *command = NULL;
    char *mapsLine = NULL;
    size_t mapsCap = 0;
    char *range, *token, *libPath = NULL;
    FILE *pipe = NULL;
    ssize_t len;
    int i, matches;
    int status = MKL_ADDR_FAILED;

    /* construct the expression for egrep query: egrep '(a|b|...|z)' */
    strcpy( egrepQuery, "egrep '(" );
    for ( i = 0; i < paramNum; ++i ) {
        const size_t used = strlen( egrepQuery );
        const size_t room = sizeof( egrepQuery ) - used;
        const int n = snprintf( egrepQuery + used, room, "%s%s", paramName[i],
                                ( i + 1 < paramNum ) ? "|" : ")'" );
        if ( 0 > n || ( size_t )n >= room ) {
            fprintf( stderr, "%s: egrep query for MKL internal parameters is too long\n", caller );
            return MKL_ADDR_FAILED;
        }
    }

    /* first, assume we are statically linked (and not stripped) */
    len = readlink( "/proc/self/exe", exePath, sizeof( exePath ) - 1 );
    if ( 0 >= len ) {
        fprintf( stderr, "%s: readlink on /proc/self/exe returns error\n", caller );
        return MKL_ADDR_FAILED;
    }
    exePath[len] = '\0';  /* readlink() does not terminate the string */

    matches = MKLScanSymbols( exePath, egrepQuery, paramName, paramAddress, paramNum );
    if ( matches > paramNum / 2 ) {
        return MKL_ADDR_RESOLVED;
    }

    /* second, assume we are dynamically linked, and we need to examine
       /proc/self/maps.  Forget whatever the executable lookup found, so
       that no address taken from the executable is rebased below. */
    for ( i = 0; i < paramNum; ++i ) {
        paramAddress[i] = NULL;
    }

    if ( 0 <= asprintf( &command, "%s /proc/%d/maps", mapsFilter, ( int )getpid() ) ) {
        pipe = popen( command, "r" );
        free( command );
    }

    /* read the first line, which contains the start address and full
       pathname of MKL core library */
    if ( NULL == pipe || 0 >= getline( &mapsLine, &mapsCap, pipe ) ) {
        fprintf( stderr, "%s: cannot find MKL in /proc/self/maps\n", caller );
        if ( NULL != pipe ) {
            pclose( pipe );
        }
        free( mapsLine );
        return MKL_ADDR_FAILED;
    }
    pclose( pipe );

    /* first token: start/end address; last token: full pathname */
    range = strtok( mapsLine, " \r\n\t" );
    while ( NULL != ( token = strtok( NULL, " \r\n\t" ) ) ) {
        libPath = token;
    }
    if ( NULL == range || NULL == libPath ) {
        fprintf( stderr, "%s: cannot find MKL in /proc/self/maps\n", caller );
        free( mapsLine );
        return MKL_ADDR_FAILED;
    }

    matches = MKLScanSymbols( libPath, egrepQuery, paramName, paramAddress, paramNum );
    if ( matches < paramNum / 2 ) {
        fprintf( stderr, "%s: cannot find addresses of MKL internal parameters in %s\n", caller, libPath );
    } else {
        unsigned long int baseAddress;

        /* "%lx" stops at the '-' separating start and end address */
        if ( 1 == sscanf( range, "%lx", &baseAddress ) ) {
            /* adjust the addresses of MKL internal parameters by baseAddress */
            for ( i = 0; i < paramNum; ++i ) {
                if ( NULL != paramAddress[i] ) {
                    paramAddress[i] = ( int* )( baseAddress + ( unsigned long int )paramAddress[i] );
                }
            }
            status = MKL_ADDR_RESOLVED;
        }
    }

    free( mapsLine );  /* range and libPath point into mapsLine */
    return status;
}

/*
   For every entry i in [first, count) whose address is known, look up the
   environment variable "MKL_<names[i]>"; when it holds a decimal integer,
   store that integer at addresses[i] and report the change on stdout.
   Values that do not parse as an integer (including the empty string) are
   ignored.
*/
static void MKLApplyEnvOverrides( const char *caller, const char **names, int **addresses, int first, int count )
{
    char envName[MKL_ENV_NAME_SIZE];
    int i;

    for ( i = first; i < count; ++i ) {
        const char *value;
        int parsed;

        if ( NULL == addresses[i] ) {
            continue;
        }
        snprintf( envName, sizeof( envName ), "MKL_%s", names[i] );
        value = getenv( envName );
        /* sscanf() returns EOF (non-zero) for an empty string, so the
           result must be compared against 1 rather than tested for truth */
        if ( NULL == value || 1 != sscanf( value, "%d", &parsed ) ) {
            continue;
        }
        *addresses[i] = parsed;
        printf( "%s: %s is changed to %d because of env variable %s\n",
                caller, names[i], *addresses[i], envName );
    }
}

/*
   Print MKL's internal dispatch parameters and overwrite those for which an
   environment variable MKL_<parameter name> holds an integer value.

   The addresses of the parameters are looked up on the first call and kept
   in static storage.  If that lookup fails, the failure is remembered and
   every later call only reports that overriding is not possible.
*/
void MKLParamOverride( void )
{
    /* paramName[0] is the base of the cache-size block; it is not printed
       or overridden under its own name, only through cacheParamName. */
    const char *paramName[] = {"MKL_cache_sizes", "disable_fast_mm", "__MKL_CPU_MicroArchitecture",
                               "itisBarcelona", "mkl_cpu_type", "__HT", "__N_Logical_Cores",
                               "__N_Physical_Cores", "__N_CPU_Packages", "__N_Cores_per_Packages"};
    const int paramNum = ( int )MKL_ARRAY_LEN( paramName );
    static int *paramAddress[MKL_ARRAY_LEN( paramName )] = {NULL};

    const char *cacheParamName[] = {"Levels_of_cache", "L1_cache_size", "L2_cache_size", "L3_cache_size"};
    const int cacheParamNum = ( int )MKL_ARRAY_LEN( cacheParamName );
    static int *cacheParamAddress[MKL_ARRAY_LEN( cacheParamName )] = {NULL};

    static int paramAddressInited = MKL_ADDR_UNRESOLVED;
    int i;

    if ( MKL_ADDR_FAILED == paramAddressInited ) {
        fprintf( stderr, "%s: cannot override MKL internal parameters due to earlier error(s)\n", __func__ );
        return;
    }

    if ( MKL_ADDR_UNRESOLVED == paramAddressInited ) {
        paramAddressInited = MKLLocateParams( __func__, paramName, paramAddress, paramNum );
        if ( MKL_ADDR_RESOLVED != paramAddressInited ) {
            return;  /* MKLLocateParams() already reported the reason */
        }
    }

    /* call a series of MKL internal functions to initialize the internal
       parameters (except "disable_fast_mm", which is set when
       mkl_serv_allocate() is called.) */
    mkl_serv_get_ht();
    mkl_serv_get_ncpus();
    mkl_serv_get_ncorespercpu();
    mkl_serv_mkl_domain_get_max_threads();
    mkl_serv_cpu_detect();
    mkl_serv_mkl_enable_instructions();
    mkl_serv_get_microarchitecture();
    mkl_serv_cpuisitbarcelona();
    mkl_serv_get_cache_size();

    fprintf( stderr, "%s: Current MKL internal parameters are:\n", __func__ );
    for ( i = 1; i < paramNum; ++i ) {
        if ( NULL != paramAddress[i] ) {
            printf( " %s:\t%d\n", paramName[i], *paramAddress[i] );
        }
    }

    /* the cache parameters are addressed as consecutive ints starting at
       MKL_cache_sizes */
    if ( NULL != paramAddress[0] ) {
        for ( i = 0; i < cacheParamNum; ++i ) {
            cacheParamAddress[i] = paramAddress[0] + i;
            printf( " %s:\t%d\n", cacheParamName[i], *cacheParamAddress[i] );
        }
    }

    MKLApplyEnvOverrides( __func__, paramName, paramAddress, 1, paramNum );
    MKLApplyEnvOverrides( __func__, cacheParamName, cacheParamAddress, 0, cacheParamNum );
}