Skip to content

crash in zesDevicePciGetProperties() #900

@bgoglin

Description

@bgoglin

Hello
hwloc crashes in recent releases of level-zero and compute-runtime. I don't know exactly when it started because it's complicated to test different releases when Debian/Ubuntu official ZE packages (mostly level-zero package names) are incompatible with the deb packages you distribute.

Anyway, here's a small reproducer extracted from hwloc. It basically just lists the ZES drivers and devices and calls zesDevicePciGetProperties() on each device:

#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#include <assert.h>

#include <level_zero/ze_api.h>
#include <level_zero/zes_api.h>

int main(void)
{
  zes_driver_handle_t *sdrh;
  uint32_t nbdrivers, i, k;
  ze_result_t res;

  printf("testing ZES devices\n");

  res = zesInit(0);
  if (res != ZE_RESULT_SUCCESS) {
    fprintf(stderr, "Failed to initialize LevelZero Sysman in zesInit(): %d\n", (int)res);
    return 0;
  }

  nbdrivers = 0;
  res = zesDriverGet(&nbdrivers, NULL);
  if (res != ZE_RESULT_SUCCESS || !nbdrivers)
    return 0;
  sdrh = malloc(nbdrivers * sizeof(*sdrh));
  if (!sdrh)
    return 0;
  res = zesDriverGet(&nbdrivers, sdrh);
  if (res != ZE_RESULT_SUCCESS) {
    free(sdrh);
    return 0;
  }

  printf("found %u L0 ZES drivers\n", nbdrivers);

  k = 0;
  for(i=0; i<nbdrivers; i++) {
    uint32_t nbdevices, j;
    zes_device_handle_t *sdvh;

    nbdevices = 0;
    res = zesDeviceGet(sdrh[i], &nbdevices, NULL);
    if (res != ZE_RESULT_SUCCESS || !nbdevices)
      continue;
    sdvh = malloc(nbdevices * sizeof(*sdvh));
    if (!sdvh)
      continue;
    res = zesDeviceGet(sdrh[i], &nbdevices, sdvh);
    if (res != ZE_RESULT_SUCCESS) {
      free(sdvh);
      continue;
    }

    printf("found %u L0 ZES devices in driver #%u\n", nbdevices, i);

    for (j=0; j<nbdevices; j++, k++) {
      zes_pci_properties_t pci;

      res = zesDevicePciGetProperties(sdvh[j], &pci);
      if (res != ZE_RESULT_SUCCESS) {
        errno = EINVAL;
        return -1;
      }
      printf("got PCI\n");
    }
    free(sdvh);
  }
  free(sdrh);

  return 0;
}

gdb shows:

testing ZES devices
⚠️ warning: Corrupted shared library list: 0x555557ab6ca0 != 0x555555576310
found 1 L0 ZES drivers
found 1 L0 ZES devices in driver #0

Program received signal SIGSEGV, Segmentation fault.
L0::Sysman::PciImp::pciStaticProperties (this=0x555557ab75b0, 
    pProperties=0x7fffffffe2b0)
    at ../../neo/level_zero/sysman/source/api/pci/sysman_pci_imp.cpp:126
⚠️ warning: 126	../../neo/level_zero/sysman/source/api/pci/sysman_pci_imp.cpp: Aucun fichier ou dossier de ce nom

The machine is a laptop with 13th Gen Intel(R) Core(TM) i7-1370P running 6.18.12+deb14-amd64
I removed all relevant packages and installed your latest ones (+ dbgsym ones):

intel-igc-core-2_2.28.4+20760_amd64.deb
intel-ocloc_26.05.37020.3-0_amd64.deb
level-zero-devel_1.27.0+u24.04_amd64.deb
libigdgmm12_22.9.0_amd64.deb
intel-igc-opencl-2_2.28.4+20760_amd64.deb
intel-opencl-icd_26.05.37020.3-0_amd64.deb
level-zero_1.27.0+u24.04_amd64.deb
libze-intel-gpu1_26.05.37020.3-0_amd64.deb

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions