Skip disk-capacity and replica-version diagnostics in cloud mode.

Diagnoser.diagnoseTablet now reads Config.isCloudMode() once. When it is true,
a backend whose disks exceed the limit is no longer reported as "has no space
left", and the replica version is no longer compared with the partition's
cached visible version (the existing in-code comment notes that the replica
version is not updated in cloud mode).

TabletReplicaTooSlowTest gains three helpers and one test. The test makes a
backend's disks exceed the limit and gives a replica a mismatching version,
checks that both problems are reported, then sets Config.cloud_unique_id and
checks that ReplicaBackendStatus and ReplicaVersionStatus both read "OK".
The original disks, version and cloud_unique_id are restored in a finally
block.

diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/Diagnoser.java b/fe/fe-core/src/main/java/org/apache/doris/system/Diagnoser.java
index 46c6abbd18a574..189080d2f8e09a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/system/Diagnoser.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/system/Diagnoser.java
@@ -26,6 +26,7 @@
 import org.apache.doris.catalog.Tablet;
 import org.apache.doris.catalog.TabletInvertedIndex;
 import org.apache.doris.catalog.TabletMeta;
+import org.apache.doris.common.Config;
 
 import com.google.common.collect.Lists;
 import org.json.simple.JSONObject;
@@ -114,6 +115,7 @@ public static List<List<String>> diagnoseTablet(long tabletId) {
         StringBuilder versionErr = new StringBuilder();
         StringBuilder statusErr = new StringBuilder();
         StringBuilder compactionErr = new StringBuilder();
+        boolean isCloudMode = Config.isCloudMode();
         // for local mode, getCachedVisibleVersion return visibleVersion.
         // for cloud mode, the replica version is not updated.
         long visibleVersion = partition.getCachedVisibleVersion();
@@ -143,20 +145,22 @@ public static List<List<String>> diagnoseTablet(long tabletId) {
                             + replica.getBackendIdWithoutException() + " is not query available. ");
                     break;
                 }
-                if (be.diskExceedLimit()) {
+                if (!isCloudMode && be.diskExceedLimit()) {
                     backendErr.append("Backend " + replica.getBackendIdWithoutException() + " has no space left. ");
                     break;
                 }
             } while (false);
             // version
-            if (replica.getVersion() != visibleVersion) {
-                versionErr.append("Replica on backend " + replica.getBackendIdWithoutException() + "'s version ("
-                        + replica.getVersion() + ") does not equal"
-                        + " to partition visible version (" + visibleVersion + ")");
-            } else if (replica.getLastFailedVersion() != -1) {
-                versionErr.append("Replica on backend "
-                        + replica.getBackendIdWithoutException() + "'s last failed version is "
-                        + replica.getLastFailedVersion());
+            if (!isCloudMode) {
+                if (replica.getVersion() != visibleVersion) {
+                    versionErr.append("Replica on backend " + replica.getBackendIdWithoutException() + "'s version ("
+                            + replica.getVersion() + ") does not equal"
+                            + " to partition visible version (" + visibleVersion + ")");
+                } else if (replica.getLastFailedVersion() != -1) {
+                    versionErr.append("Replica on backend "
+                            + replica.getBackendIdWithoutException() + "'s last failed version is "
+                            + replica.getLastFailedVersion());
+                }
             }
             // status
             if (!replica.isAlive() || replica.isUserDrop()) {
diff --git a/fe/fe-core/src/test/java/org/apache/doris/clone/TabletReplicaTooSlowTest.java b/fe/fe-core/src/test/java/org/apache/doris/clone/TabletReplicaTooSlowTest.java
index dde4be35ef63f6..d0efb6dffd2c9e 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/clone/TabletReplicaTooSlowTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/clone/TabletReplicaTooSlowTest.java
@@ -17,9 +17,14 @@
 
 package org.apache.doris.clone;
 
+import org.apache.doris.catalog.Database;
 import org.apache.doris.catalog.DiskInfo;
 import org.apache.doris.catalog.Env;
+import org.apache.doris.catalog.MaterializedIndex;
+import org.apache.doris.catalog.OlapTable;
+import org.apache.doris.catalog.Partition;
 import org.apache.doris.catalog.Replica;
+import org.apache.doris.catalog.Tablet;
 import org.apache.doris.catalog.TabletInvertedIndex;
 import org.apache.doris.common.Config;
 import org.apache.doris.common.ExceptionChecker;
@@ -162,6 +167,98 @@ private static void updateReplicaVersionCount() {
         Assert.assertTrue(result.get(11).get(1).contains("version count is too high"));
     }
 
+    private static String getDiagnosisInfo(List<List<String>> rows, String item) {
+        for (List<String> row : rows) {
+            if (item.equals(row.get(0))) {
+                return row.get(1);
+            }
+        }
+        return "";
+    }
+
+    private static Map<String, TDisk> copyBackendDisks(Backend backend) {
+        Map<String, TDisk> disks = Maps.newHashMap();
+        for (DiskInfo diskInfo : backend.getDisks().values()) {
+            TDisk tDisk = new TDisk();
+            tDisk.setRootPath(diskInfo.getRootPath());
+            tDisk.setDiskTotalCapacity(diskInfo.getTotalCapacityB());
+            tDisk.setDataUsedCapacity(diskInfo.getDataUsedCapacityB());
+            tDisk.setTrashUsedCapacity(diskInfo.getTrashUsedCapacityB());
+            tDisk.setDiskAvailableCapacity(diskInfo.getAvailableCapacityB());
+            tDisk.setUsed(diskInfo.getState() == DiskInfo.DiskState.ONLINE);
+            tDisk.setPathHash(diskInfo.getPathHash());
+            tDisk.setStorageMedium(diskInfo.getStorageMedium());
+            disks.put(tDisk.getRootPath(), tDisk);
+        }
+        return disks;
+    }
+
+    private static Map<String, TDisk> buildExceedLimitDisks(Backend backend) {
+        Map<String, TDisk> disks = Maps.newHashMap();
+        for (DiskInfo diskInfo : backend.getDisks().values()) {
+            TDisk tDisk = new TDisk();
+            tDisk.setRootPath(diskInfo.getRootPath());
+            tDisk.setDiskTotalCapacity(1L);
+            tDisk.setDataUsedCapacity(1L);
+            tDisk.setTrashUsedCapacity(0L);
+            tDisk.setDiskAvailableCapacity(0L);
+            tDisk.setUsed(true);
+            tDisk.setPathHash(diskInfo.getPathHash());
+            tDisk.setStorageMedium(diskInfo.getStorageMedium());
+            disks.put(tDisk.getRootPath(), tDisk);
+        }
+        return disks;
+    }
+
+    @Test
+    public void testDiagnoseTabletCloudModeSkipDiskAndVersionCheck() throws Exception {
+        String tableName = "tbl_diag_cloud_" + Math.abs(random.nextInt());
+        String createStr = "create table test." + tableName + "\n"
+                + "(k1 date, k2 int)\n"
+                + "distributed by hash(k2) buckets 1\n"
+                + "properties\n"
+                + "(\n"
+                + " \"replication_num\" = \"3\"\n"
+                + ")";
+        ExceptionChecker.expectThrowsNoException(() -> createTable(createStr));
+
+        Database db = Env.getCurrentInternalCatalog().getDbNullable("test");
+        Assert.assertNotNull(db);
+        OlapTable table = (OlapTable) db.getTableNullable(tableName);
+        Assert.assertNotNull(table);
+        Partition partition = table.getAllPartitions().iterator().next();
+        MaterializedIndex index = partition.getBaseIndex();
+        Tablet tablet = index.getTablets().get(0);
+        Replica replica = tablet.getReplicas().get(0);
+        long tabletId = tablet.getId();
+        long visibleVersion = partition.getCachedVisibleVersion();
+        Backend backend = Env.getCurrentSystemInfo().getBackend(replica.getBackendIdWithoutException());
+        Assert.assertNotNull(backend);
+
+        Map<String, TDisk> originalDisks = copyBackendDisks(backend);
+        String originCloudUniqueId = Config.cloud_unique_id;
+        long originalVersion = replica.getVersion();
+
+        try {
+            backend.updateDisks(buildExceedLimitDisks(backend));
+            long mismatchVersion = visibleVersion == Long.MAX_VALUE ? visibleVersion - 1 : visibleVersion + 1;
+            replica.adminUpdateVersionInfo(mismatchVersion, null, null, System.currentTimeMillis());
+
+            List<List<String>> localResult = Diagnoser.diagnoseTablet(tabletId);
+            Assert.assertTrue(getDiagnosisInfo(localResult, "ReplicaBackendStatus").contains("has no space left"));
+            Assert.assertTrue(getDiagnosisInfo(localResult, "ReplicaVersionStatus").contains("does not equal"));
+
+            Config.cloud_unique_id = "diagnose-tablet-cloud-mode-ut";
+            List<List<String>> cloudResult = Diagnoser.diagnoseTablet(tabletId);
+            Assert.assertEquals("OK", getDiagnosisInfo(cloudResult, "ReplicaBackendStatus"));
+            Assert.assertEquals("OK", getDiagnosisInfo(cloudResult, "ReplicaVersionStatus"));
+        } finally {
+            Config.cloud_unique_id = originCloudUniqueId;
+            backend.updateDisks(originalDisks);
+            replica.adminUpdateVersionInfo(originalVersion, null, null, System.currentTimeMillis());
+        }
+    }
+
     @Test
     public void test() throws Exception {
         // test colocate tablet repair