diff --git a/go/BUILD.bazel b/go/BUILD.bazel
index db719d88fb91..3588f81195e1 100644
--- a/go/BUILD.bazel
+++ b/go/BUILD.bazel
@@ -56,9 +56,19 @@ codeql_pkg_files(
     prefix = "tools/{CODEQL_PLATFORM}",
 )
 
+codeql_pkg_files(
+    name = "canonicalize-dll",
+    srcs = select({
+        "@platforms//os:windows": ["//shared/canonicalize:pkg"],
+        "//conditions:default": [],
+    }),
+    prefix = "tools/{CODEQL_PLATFORM}",
+)
+
 codeql_pack(
     name = "go",
     srcs = [
+        ":canonicalize-dll",
         ":extractor-pack-arch",
         ":resources",
         "//go/codeql-tools",
diff --git a/go/extractor/BUILD.bazel b/go/extractor/BUILD.bazel
index 23158e25b15f..3a303868ae5f 100644
--- a/go/extractor/BUILD.bazel
+++ b/go/extractor/BUILD.bazel
@@ -16,6 +16,7 @@ go_library(
     importpath = "github.com/github/codeql-go/extractor",
     visibility = ["//visibility:public"],
     deps = [
+        "//go/extractor/canonicalize",
         "//go/extractor/dbscheme",
         "//go/extractor/diagnostics",
         "//go/extractor/srcarchive",
diff --git a/go/extractor/canonicalize/BUILD.bazel b/go/extractor/canonicalize/BUILD.bazel
new file mode 100644
index 000000000000..02ee70ea6ac4
--- /dev/null
+++ b/go/extractor/canonicalize/BUILD.bazel
@@ -0,0 +1,11 @@
+load("@rules_go//go:def.bzl", "go_library")
+
+go_library(
+    name = "canonicalize",
+    srcs = [
+        "canonicalize_other.go",
+        "canonicalize_windows.go",
+    ],
+    importpath = "github.com/github/codeql-go/extractor/canonicalize",
+    visibility = ["//visibility:public"],
+)
diff --git a/go/extractor/canonicalize/canonicalize_other.go b/go/extractor/canonicalize/canonicalize_other.go
new file mode 100644
index 000000000000..84880032456d
--- /dev/null
+++ b/go/extractor/canonicalize/canonicalize_other.go
@@ -0,0 +1,7 @@
+//go:build !windows
+
+package canonicalize
+
+// CanonicalizePath returns path unchanged. This variant is compiled for every
+// non-Windows target (see the build constraint at the top of the file).
+func CanonicalizePath(path string) string { return path }
diff --git a/go/extractor/canonicalize/canonicalize_windows.go b/go/extractor/canonicalize/canonicalize_windows.go
new file mode 100644
index 000000000000..c417d40d217d
--- /dev/null
+++ 
b/go/extractor/canonicalize/canonicalize_windows.go
@@ -0,0 +1,82 @@
+//go:build windows
+
+// Package canonicalize resolves file paths to a canonical form. On Windows it
+// delegates to codeql_canonical_path.dll when that library can be loaded.
+package canonicalize
+
+import (
+	"os"
+	"path/filepath"
+	"syscall"
+	"unsafe"
+)
+
+var (
+	dll              *syscall.DLL  // keeps the loaded library referenced
+	procCanonicalize *syscall.Proc // canonicalize_path_u8
+	procFree         *syscall.Proc // canonicalize_free_u8
+	available        bool          // true once both procedures are bound
+)
+
+// init loads codeql_canonical_path.dll from
+// %CODEQL_EXTRACTOR_GO_ROOT%\tools\win64 and binds its UTF-8 entry points.
+// Any failure leaves available false, so CanonicalizePath returns its input
+// unchanged.
+func init() {
+	root := os.Getenv("CODEQL_EXTRACTOR_GO_ROOT")
+	if root == "" {
+		return
+	}
+	d, err := syscall.LoadDLL(filepath.Join(root, "tools", "win64", "codeql_canonical_path.dll"))
+	if err != nil {
+		return
+	}
+	canon, err := d.FindProc("canonicalize_path_u8")
+	if err != nil {
+		_ = d.Release() // best effort: the library is unusable either way
+		return
+	}
+	free, err := d.FindProc("canonicalize_free_u8")
+	if err != nil {
+		// Without the matching free function every result would leak.
+		_ = d.Release()
+		return
+	}
+	dll, procCanonicalize, procFree, available = d, canon, free, true
+}
+
+// CanonicalizePath returns the canonical form of path as reported by the
+// native library. It returns path unchanged when the library is unavailable,
+// when path is empty or contains a NUL byte, or when the library returns a
+// null result.
+func CanonicalizePath(path string) string {
+	if !available || path == "" {
+		return path
+	}
+	// BytePtrFromString appends the terminating NUL and rejects interior NUL
+	// bytes, which would otherwise silently truncate the path.
+	in, err := syscall.BytePtrFromString(path)
+	if err != nil {
+		return path
+	}
+	ret, _, _ := procCanonicalize.Call(uintptr(unsafe.Pointer(in)))
+	if ret == 0 {
+		return path
+	}
+	// The result is owned by the library; copy it out before releasing it.
+	defer procFree.Call(ret)
+	return bytePtrToString((*byte)(unsafe.Pointer(ret)))
+}
+
+// bytePtrToString copies the NUL-terminated byte string at p into a Go
+// string. A nil p yields the empty string.
+func bytePtrToString(p *byte) string {
+	if p == nil {
+		return ""
+	}
+	var n int
+	for ptr := unsafe.Pointer(p); *(*byte)(ptr) != 0; n++ {
+		ptr = unsafe.Add(ptr, 1)
+	}
+	return string(unsafe.Slice(p, n))
+}
diff --git a/go/extractor/extractor.go b/go/extractor/extractor.go
index 158f0029704d..472b748ed6a9 100644
--- a/go/extractor/extractor.go
+++ b/go/extractor/extractor.go
@@ -22,6 +22,7 @@ import (
 	"sync"
 	"time"
 
+	"github.com/github/codeql-go/extractor/canonicalize"
 	"github.com/github/codeql-go/extractor/dbscheme"
 	"github.com/github/codeql-go/extractor/diagnostics"
 	"github.com/github/codeql-go/extractor/srcarchive"
@@ -766,7 +767,7 @@ func normalizedPath(ast *ast.File, fset *token.FileSet) string {
 	if err != nil {
 		return file
 	}
-	return path
+	return canonicalize.CanonicalizePath(path)
 }
 
 // extractFile extracts AST information for the given file
diff --git 
a/java/kotlin-extractor/src/main/java/com/semmle/util/files/FileUtil.java b/java/kotlin-extractor/src/main/java/com/semmle/util/files/FileUtil.java
index 79ce2d8d8d3d..47c53ca817dd 100644
--- a/java/kotlin-extractor/src/main/java/com/semmle/util/files/FileUtil.java
+++ b/java/kotlin-extractor/src/main/java/com/semmle/util/files/FileUtil.java
@@ -1242,11 +1242,11 @@ public static String relativePathLink (File f, File base)
 	public static File tryMakeCanonical (File f)
 	{
 		try {
-			return f.getCanonicalFile();
+			return NativeCanonicalizer.resolve(f.getCanonicalFile());
 		}
 		catch (IOException ignored) {
 			Exceptions.ignore(ignored, "Can't log error: Could be too verbose.");
-			return new File(simplifyPath(f));
+			return NativeCanonicalizer.resolve(new File(simplifyPath(f)));
 		}
 	}
 
diff --git a/java/kotlin-extractor/src/main/java/com/semmle/util/files/NativeCanonicalizer.java b/java/kotlin-extractor/src/main/java/com/semmle/util/files/NativeCanonicalizer.java
new file mode 100644
index 000000000000..61f665670fd4
--- /dev/null
+++ b/java/kotlin-extractor/src/main/java/com/semmle/util/files/NativeCanonicalizer.java
@@ -0,0 +1,52 @@
+package com.semmle.util.files;
+
+import java.io.File;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+/**
+ * Resolves files through the native {@code codeql_canonical_path.dll}. Loading is attempted only
+ * when the platform file separator is a backslash and {@code CODEQL_DIST} is set; if the library
+ * is not loaded, {@link #resolve(File)} returns its argument unchanged.
+ */
+public class NativeCanonicalizer {
+  private static final boolean available;
+
+  static {
+    boolean loaded = false;
+    if (File.separatorChar == '\\') {
+      String dist = System.getenv("CODEQL_DIST");
+      if (dist != null && !dist.isEmpty()) {
+        try {
+          Path library = Paths.get(dist).resolve("tools").resolve("win64")
+              .resolve("codeql_canonical_path.dll").toAbsolutePath();
+          System.load(library.toString());
+          loaded = true;
+        } catch (RuntimeException | UnsatisfiedLinkError ignored) {
+          // Best effort: without the library, paths are returned as given.
+        }
+      }
+    }
+    available = loaded;
+  }
+
+  private NativeCanonicalizer() {}
+
+  // UTF-16 JNI interface - no encoding conversion
+  private static native String nativeCanonicalizePath(String path);
+
+  /**
+   * Returns the native canonical form of {@code path}, or {@code path} itself when the library is
+   * unavailable or returns {@code null}.
+   */
+  public static File resolve(File path) {
+    if (!available) return path;
+    String result = nativeCanonicalizePath(path.getAbsolutePath());
+    return result != null ? new File(result) : path;
+  }
+
+  /** Returns whether the native library was loaded. */
+  public static boolean isAvailable() {
+    return available;
+  }
+}
diff --git a/shared/canonicalize/BUILD.bazel b/shared/canonicalize/BUILD.bazel
new file mode 100644
index 000000000000..afb18fc29bd3
--- /dev/null
+++ b/shared/canonicalize/BUILD.bazel
@@ -0,0 +1,35 @@
+load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
+load("@rules_pkg//pkg:mappings.bzl", "pkg_attributes", "pkg_files")
+
+cc_binary(
+    name = "codeql_canonical_path.dll",
+    srcs = [
+        "canonicalize.cpp",
+        "canonicalize.h",
+        "canonicalize_jni.cpp",
+    ],
+    defines = ["CODEQL_CANONICALIZE_EXPORTS"],
+    linkopts = ["-lkernel32"],
+    linkshared = True,
+    target_compatible_with = ["@platforms//os:windows"],
+    visibility = ["//visibility:public"],
+    deps = ["@rules_java//toolchains:jni"],
+)
+
+cc_library(
+    name = "canonicalize",
+    srcs = ["canonicalize.cpp"],
+    hdrs = ["canonicalize.h"],
+    defines = ["CODEQL_CANONICALIZE_EXPORTS"],
+    linkopts = ["-lkernel32"],
+    target_compatible_with = ["@platforms//os:windows"],
+    visibility = ["//visibility:public"],
+)
+
+pkg_files(
+    name = "pkg",
+    srcs = [":codeql_canonical_path.dll"],
+    attributes = pkg_attributes(mode = "0755"),
+    target_compatible_with = ["@platforms//os:windows"],
+    visibility = ["//visibility:public"],
+)
diff --git a/shared/canonicalize/canonicalize.cpp b/shared/canonicalize/canonicalize.cpp
new file mode 100644
index 000000000000..e09b80f42d01
--- /dev/null
+++ b/shared/canonicalize/canonicalize.cpp
@@ -0,0 +1,192 @@
+#ifdef _WIN32
+#include "canonicalize.h"
+#include <windows.h>
+
+#include <cstdlib>
+#include <iterator>
+#include <mutex>
+#include <random>
+#include <shared_mutex>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <utility>
+
+namespace {
+
+// Bounded, process-wide cache from an input path to its canonical form.
+class PathCache {
+public:
+    static PathCache& instance() {
+        static PathCache cache;
+        return cache;
+    }
+
+    // Returns a _wcsdup'ed copy of the canonical form of `path`, or nullptr
+    // when the path cannot be resolved.
+    const wchar_t* canonicalize(const wchar_t* path) {
+        std::wstring key(path);
+
+        // Fast path: shared (read) lock for cache hit
+        {
+            std::shared_lock lock(mutex_);
+            auto it = cache_.find(key);
+            if (it != cache_.end()) {
+                return _wcsdup(it->second.c_str());
+            }
+        }
+
+        // Slow path: resolve without holding the lock, then insert under an
+        // exclusive lock
+        std::wstring resolved = resolve(path);
+        if (resolved.empty()) return nullptr;
+
+        std::unique_lock lock(mutex_);
+        // Check again under exclusive lock (another thread may have inserted)
+        auto it = cache_.find(key);
+        if (it != cache_.end()) {
+            return _wcsdup(it->second.c_str());
+        }
+
+        // Evict a random entry if at capacity (matches C# strategy)
+        if (cache_.size() >= max_capacity_) {
+            std::uniform_int_distribution<size_t> dist(0, cache_.size() - 1);
+            auto evict = cache_.begin();
+            std::advance(evict, dist(rng_));
+            cache_.erase(evict);
+        }
+
+        auto inserted = cache_.emplace(std::move(key), std::move(resolved)).first;
+        return _wcsdup(inserted->second.c_str());
+    }
+
+private:
+    PathCache() = default;
+
+    static constexpr size_t max_capacity_ = 4096;
+    std::unordered_map<std::wstring, std::wstring> cache_;
+    std::shared_mutex mutex_;
+    std::mt19937 rng_{std::random_device{}()};
+
+    // Asks the OS for the final name of `path`; paths that cannot be opened
+    // are handled by resolve_nonexistent. Returns an empty string on failure.
+    static std::wstring resolve(const wchar_t* path) {
+        HANDLE h = CreateFileW(
+            path,
+            0,
+            FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+            nullptr,
+            OPEN_EXISTING,
+            FILE_FLAG_BACKUP_SEMANTICS,
+            nullptr);
+
+        if (h == INVALID_HANDLE_VALUE) {
+            return resolve_nonexistent(path);
+        }
+
+        std::wstring result = get_final_path(h);
+        CloseHandle(h);
+
+        if (result.empty()) return {};
+        return strip_prefix(result);
+    }
+
+    static std::wstring get_final_path(HANDLE h) {
+        wchar_t buf[MAX_PATH];
+        DWORD len = GetFinalPathNameByHandleW(h, buf, MAX_PATH, FILE_NAME_NORMALIZED);
+
+        if (len > 0 && len < MAX_PATH) {
+            return std::wstring(buf, len);
+        }
+        if (len >= MAX_PATH) {
+            std::wstring big(len + 1, L'\0');
+            DWORD got = GetFinalPathNameByHandleW(h, big.data(), len + 1, FILE_NAME_NORMALIZED);
+            // A value above `len` means the name grew between the two calls
+            // and was not written in full; do not read past the buffer.
+            if (got > 0 && got <= len) return std::wstring(big.data(), got);
+        }
+        return {};
+    }
+
+    static std::wstring strip_prefix(const std::wstring& path) {
+        constexpr std::wstring_view unc_prefix = L"\\\\?\\UNC\\";
+        constexpr std::wstring_view lp_prefix = L"\\\\?\\";
+
+        if (path.starts_with(unc_prefix)) {
+            return L"\\" + path.substr(unc_prefix.size() - 1);
+        }
+        if (path.starts_with(lp_prefix)) {
+            return std::wstring(path.substr(lp_prefix.size()));
+        }
+        return path;
+    }
+
+    // For non-existent files: canonicalize parent, append filename
+    // (matches C#'s ConstructCanonicalPath)
+    static std::wstring resolve_nonexistent(const wchar_t* path) {
+        std::wstring spath(path);
+        auto sep = spath.find_last_of(L"\\/");
+        if (sep == std::wstring::npos) return {};
+
+        std::wstring parent = spath.substr(0, sep);
+        std::wstring name = spath.substr(sep + 1);
+
+        // A bare "X:" is drive-relative (the current directory on X:), so
+        // keep the separator when the parent is a drive root.
+        if (parent.size() == 2 && parent[1] == L':') parent += L'\\';
+        // Stop when no shorter parent is left to try (e.g. "X:\" itself).
+        if (parent.size() >= spath.size()) return {};
+
+        std::wstring canonical_parent = resolve(parent.c_str());
+        if (canonical_parent.empty()) return {};
+
+        // A canonical drive root already ends in a separator.
+        if (canonical_parent.back() != L'\\') canonical_parent += L'\\';
+        return canonical_parent + name;
+    }
+};
+
+} // namespace
+
+extern "C" {
+
+CODEQL_API const wchar_t* canonicalize_path_w(const wchar_t* path) {
+    if (!path || !*path) return nullptr;
+    return PathCache::instance().canonicalize(path);
+}
+
+CODEQL_API void canonicalize_free_w(const wchar_t* path) {
+    free(const_cast<wchar_t*>(path));
+}
+
+CODEQL_API const char* canonicalize_path_u8(const char* path) {
+    if (!path || !*path) return nullptr;
+
+    int wlen = MultiByteToWideChar(CP_UTF8, 0, path, -1, nullptr, 0);
+    if (wlen <= 0) return nullptr;
+    std::wstring wpath(wlen - 1, L'\0');
+    MultiByteToWideChar(CP_UTF8, 0, path, -1, wpath.data(), wlen);
+
+    const wchar_t* wresult = PathCache::instance().canonicalize(wpath.c_str());
+    if (!wresult) return nullptr;
+
+    int ulen = WideCharToMultiByte(CP_UTF8, 0, wresult, -1, nullptr, 0, nullptr, nullptr);
+    if (ulen <= 0) { free(const_cast<wchar_t*>(wresult)); return nullptr; }
+    char* result = static_cast<char*>(malloc(ulen));
+    // malloc can fail; never hand a null buffer to the conversion.
+    if (result &&
+        WideCharToMultiByte(CP_UTF8, 0, wresult, -1, result, ulen, nullptr, nullptr) <= 0) {
+        free(result);
+        result = nullptr;
+    }
+    free(const_cast<wchar_t*>(wresult));
+
+    return result;
+}
+
+CODEQL_API void canonicalize_free_u8(const char* path) {
+    free(const_cast<char*>(path));
+}
+
+} // extern "C"
+#endif
diff --git 
a/shared/canonicalize/canonicalize.h b/shared/canonicalize/canonicalize.h
new file mode 100644
index 000000000000..ce852145772e
--- /dev/null
+++ b/shared/canonicalize/canonicalize.h
@@ -0,0 +1,35 @@
+#ifndef CODEQL_CANONICALIZE_H
+#define CODEQL_CANONICALIZE_H
+
+#ifdef _WIN32
+
+#ifdef CODEQL_CANONICALIZE_EXPORTS
+#define CODEQL_API __declspec(dllexport)
+#else
+#define CODEQL_API __declspec(dllimport)
+#endif
+
+#include <wchar.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Each canonicalize_path_* function returns a null pointer when `path` is
+// null or empty, or when it cannot be resolved. A non-null result is heap
+// allocated and must be released with the matching canonicalize_free_*.
+
+// UTF-16 interface (for JNI / Java / Kotlin)
+CODEQL_API const wchar_t* canonicalize_path_w(const wchar_t* path);
+CODEQL_API void canonicalize_free_w(const wchar_t* path);
+
+// UTF-8 interface (for Go)
+CODEQL_API const char* canonicalize_path_u8(const char* path);
+CODEQL_API void canonicalize_free_u8(const char* path);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _WIN32
+#endif // CODEQL_CANONICALIZE_H
diff --git a/shared/canonicalize/canonicalize_jni.cpp b/shared/canonicalize/canonicalize_jni.cpp
new file mode 100644
index 000000000000..0bb8dc4fc482
--- /dev/null
+++ b/shared/canonicalize/canonicalize_jni.cpp
@@ -0,0 +1,38 @@
+#ifdef _WIN32
+#include <jni.h>
+
+#include <cwchar>
+#include <string>
+
+#include "canonicalize.h"
+
+extern "C" {
+
+// JNI entry point for NativeCanonicalizer.nativeCanonicalizePath. Returns the
+// canonical path as a new Java string, or null when `jpath` is null or the
+// path cannot be resolved.
+JNIEXPORT jstring JNICALL
+Java_com_semmle_util_files_NativeCanonicalizer_nativeCanonicalizePath(
+    JNIEnv *env, jclass cls, jstring jpath) {
+
+    if (jpath == nullptr) return nullptr;
+
+    // GetStringChars does not promise a terminating NUL, but
+    // canonicalize_path_w reads up to one. Copy the characters into a
+    // std::wstring, whose c_str() is always NUL-terminated.
+    const jsize len = env->GetStringLength(jpath);
+    std::wstring path(static_cast<size_t>(len), L'\0');
+    env->GetStringRegion(jpath, 0, len, reinterpret_cast<jchar*>(path.data()));
+
+    const wchar_t* result = canonicalize_path_w(path.c_str());
+    if (result == nullptr) return nullptr;
+
+    jstring jresult = env->NewString(
+        reinterpret_cast<const jchar*>(result),
+        static_cast<jsize>(wcslen(result)));
+    canonicalize_free_w(result);
+    return jresult;
+}
+
+} // extern "C"
+#endif