ci(issues): add LLM-driven duplicate issue detection (#2381)
This commit is contained in:
72
.github/workflows/detect-duplicate.yml
vendored
Normal file
72
.github/workflows/detect-duplicate.yml
vendored
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
# yaml-language-server: $schema=https://json.schemastore.org/github-workflow.json
# Runs on every newly opened issue: embeds it, compares against the prebuilt
# issue index, and asks an LLM to confirm duplicates before commenting.
name: Duplicate Issue Detector

on:
  issues:
    types: [opened]

# Default to no permissions; the job grants only what it needs.
permissions: {}

env:
  EMBEDDING_MODEL: ${{ vars.EMBEDDING_MODEL }}
  GROQ_MODEL: ${{ vars.GROQ_MODEL }}
  GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}

jobs:
  detect-duplicate:
    runs-on: ubuntu-24.04
    # `issues` events should never carry a pull_request payload, but guard anyway.
    if: ${{ !github.event.issue.pull_request }}
    permissions:
      issues: write # post the duplicate comment and add the label
      actions: read # download the issue-index artifact from another workflow
      contents: read

    steps:
      - name: Checkout repository
        uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5.0.1

      - name: Set up Node.js
        uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
        with:
          node-version-file: 'package.json'

      # Caches the downloaded ONNX embedding model between runs.
      - name: Cache embedding model
        uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
        with:
          path: ~/.cache/huggingface
          key: hf-model-${{ vars.EMBEDDING_MODEL }}

      - name: Install dependencies
        working-directory: bin/duplicate-detector
        run: npm ci

      # Reuse the nightly index built by rebuild-issue-index.yml when available.
      - name: Download issue index
        uses: dawidd6/action-download-artifact@5c98f0b039f36ef966fdb7dfa9779262785ecb05 # v14
        with:
          name: issue-index
          workflow: rebuild-issue-index.yml
          path: bin/duplicate-detector
          search_artifacts: true
          if_no_artifact_found: warn

      # Fallback: build the index inline if the artifact was missing/expired.
      - name: Build index if missing
        working-directory: bin/duplicate-detector
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          INDEX_PATH: issue_index.json
        run: |
          if [ ! -f issue_index.json ]; then
            echo "No index found — building from scratch..."
            node build-index.mjs
          fi

      # Best-effort: a detector failure must not fail the issue workflow.
      - name: Detect duplicates
        working-directory: bin/duplicate-detector
        continue-on-error: true
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          ISSUE_NUMBER: ${{ github.event.issue.number }}
          INDEX_PATH: issue_index.json
        run: node detect.mjs
|
||||||
54
.github/workflows/rebuild-issue-index.yml
vendored
Normal file
54
.github/workflows/rebuild-issue-index.yml
vendored
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
# yaml-language-server: $schema=https://json.schemastore.org/github-workflow.json
# Nightly job that embeds all open (and recently closed) issues and uploads
# the resulting index artifact for the duplicate detector to consume.
name: Rebuild Issue Index

on:
  schedule:
    # Daily at 03:00 UTC.
    - cron: "0 3 * * *"
  workflow_dispatch:

# Default to no permissions; the job grants only what it needs.
permissions: {}

env:
  EMBEDDING_MODEL: ${{ vars.EMBEDDING_MODEL }}

jobs:
  build-index:
    runs-on: ubuntu-24.04
    permissions:
      issues: read
      actions: write # upload the issue-index artifact
      contents: read

    steps:
      - name: Checkout repository
        uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5.0.1

      - name: Set up Node.js
        uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5
        with:
          node-version-file: 'package.json'

      # Caches the downloaded ONNX embedding model between runs.
      - name: Cache embedding model
        uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
        with:
          path: ~/.cache/huggingface
          key: hf-model-${{ vars.EMBEDDING_MODEL }}

      - name: Install dependencies
        working-directory: bin/duplicate-detector
        run: npm ci

      - name: Build issue index
        working-directory: bin/duplicate-detector
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          INDEX_PATH: issue_index.json
        run: node build-index.mjs

      - name: Upload index artifact
        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
        with:
          name: issue-index
          path: bin/duplicate-detector/issue_index.json
          retention-days: 7
|
||||||
120
bin/duplicate-detector/build-index.mjs
Normal file
120
bin/duplicate-detector/build-index.mjs
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
#!/usr/bin/env node
|
||||||
|
/**
|
||||||
|
* Build Issue Embedding Index
|
||||||
|
*
|
||||||
|
* Fetches all open issues and recently closed ones,
|
||||||
|
* generates embeddings using a local ONNX transformer model,
|
||||||
|
* and saves them as a JSON artifact for the duplicate detector.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { pipeline } from '@huggingface/transformers';
|
||||||
|
import { mkdirSync, writeFileSync } from 'node:fs';
|
||||||
|
import { dirname } from 'node:path';
|
||||||
|
import { fetchIssues, issueText } from './utils.mjs';
|
||||||
|
|
||||||
|
const MODEL_NAME = process.env.EMBEDDING_MODEL || 'Xenova/all-MiniLM-L6-v2';
|
||||||
|
const OUTPUT_PATH = 'issue_index.json';
|
||||||
|
const INCLUDE_CLOSED_DAYS = 90;
|
||||||
|
const MAX_ISSUES = 5000;
|
||||||
|
const BATCH_SIZE = 64;
|
||||||
|
|
||||||
|
/**
 * Entry point: fetch open + recently-closed issues, embed them in batches,
 * and write the JSON index (metadata + embedding vectors) to OUTPUT_PATH.
 */
async function main() {
  console.log('Fetching open issues...');
  const openIssues = await fetchIssues({
    state: 'open',
    maxIssues: MAX_ISSUES,
  });
  console.log(`Fetched ${openIssues.length} open issues`);

  // Also index issues closed within the lookback window so brand-new
  // duplicates of just-closed issues are still caught.
  const since = new Date(
    Date.now() - INCLUDE_CLOSED_DAYS * 24 * 60 * 60 * 1000
  ).toISOString();
  console.log(
    `Fetching closed issues from last ${INCLUDE_CLOSED_DAYS} days...`
  );

  const closedIssues = await fetchIssues({
    state: 'closed',
    since,
    maxIssues: MAX_ISSUES,
  });
  console.log(`Fetched ${closedIssues.length} closed issues`);
  let allIssues = [...openIssues, ...closedIssues];

  // De-duplicate by issue number (an issue can appear in both fetches,
  // e.g. if it was closed while the open-issue pages were being read).
  const seen = new Set();
  allIssues = allIssues.filter((issue) => {
    if (seen.has(issue.number)) return false;
    seen.add(issue.number);
    return true;
  });

  console.log(`Total unique issues to index: ${allIssues.length}`);

  if (allIssues.length === 0) {
    console.warn('No issues found - writing empty index');
    writeFileSync(OUTPUT_PATH, JSON.stringify({ issues: [], embeddings: [] }));
    return;
  }

  console.log(`Loading model: ${MODEL_NAME}`);
  const extractor = await pipeline('feature-extraction', MODEL_NAME, {
    dtype: 'fp32',
  });

  const texts = allIssues.map((issue) => issueText(issue.title, issue.body));
  const allEmbeddings = [];

  console.log(`Generating embeddings for ${texts.length} issues...`);
  // Batch the texts to bound peak memory. Mean pooling + normalization means
  // the detector can use a plain dot product as cosine similarity.
  for (let i = 0; i < texts.length; i += BATCH_SIZE) {
    const batch = texts.slice(i, i + BATCH_SIZE);
    const output = await extractor(batch, {
      pooling: 'mean',
      normalize: true,
    });

    const vectors = output.tolist();
    allEmbeddings.push(...vectors);

    const progress = Math.min(i + BATCH_SIZE, texts.length);
    console.log(`  ${progress}/${texts.length}`);
  }

  // Keep only the metadata the detector needs; bodies are truncated so the
  // artifact stays small.
  const issueMetadata = allIssues.map((issue) => {
    const body = (issue.body || '').trim();
    return {
      number: issue.number,
      title: issue.title,
      state: issue.state,
      url: issue.html_url,
      body_preview: body.slice(0, 500) || '',
      labels: (issue.labels || []).map((l) => l.name),
      created_at: issue.created_at,
      updated_at: issue.updated_at,
    };
  });

  const indexData = {
    issues: issueMetadata,
    embeddings: allEmbeddings,
    model: MODEL_NAME,
    issue_count: issueMetadata.length,
    built_at: new Date().toISOString(),
  };

  // OUTPUT_PATH is currently a bare filename, so this mkdir is a no-op; it
  // only matters if OUTPUT_PATH ever gains a directory component.
  const dir = dirname(OUTPUT_PATH);
  if (dir && dir !== '.') mkdirSync(dir, { recursive: true });
  writeFileSync(OUTPUT_PATH, JSON.stringify(indexData));

  const sizeMb = (
    Buffer.byteLength(JSON.stringify(indexData)) /
    (1024 * 1024)
  ).toFixed(1);
  console.log(
    `Index saved to ${OUTPUT_PATH} (${sizeMb} MB, ${issueMetadata.length} issues)`
  );
}
|
||||||
|
|
||||||
|
// Run the indexer; any unhandled failure aborts with a non-zero exit code so
// the CI step is reported as failed.
main().catch((error) => {
  console.error(error);
  process.exit(1);
});
|
||||||
274
bin/duplicate-detector/detect.mjs
Normal file
274
bin/duplicate-detector/detect.mjs
Normal file
@@ -0,0 +1,274 @@
|
|||||||
|
#!/usr/bin/env node
|
||||||
|
/**
|
||||||
|
* Duplicate Issue Detector
|
||||||
|
*
|
||||||
|
* Triggered on new issue creation. Compares the new issue against an
|
||||||
|
* existing embedding index, then uses an LLM to
|
||||||
|
* confirm duplicates before posting a comment for maintainer review.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { pipeline } from '@huggingface/transformers';
|
||||||
|
import { existsSync, readFileSync } from 'node:fs';
|
||||||
|
import {
|
||||||
|
addLabel,
|
||||||
|
dotProduct,
|
||||||
|
fetchIssues,
|
||||||
|
getIssue,
|
||||||
|
issueText,
|
||||||
|
postComment,
|
||||||
|
} from './utils.mjs';
|
||||||
|
|
||||||
|
const SIMILARITY_THRESHOLD = 0.55;
|
||||||
|
const TOP_K = 5;
|
||||||
|
const MAX_COMMENT_CANDIDATES = 3;
|
||||||
|
const MODEL_NAME = process.env.EMBEDDING_MODEL || 'Xenova/all-MiniLM-L6-v2';
|
||||||
|
const GROQ_MODEL = process.env.GROQ_MODEL || 'llama-3.3-70b-versatile';
|
||||||
|
const INDEX_PATH = 'issue_index.json';
|
||||||
|
const LABEL_NAME = 'possible-duplicate';
|
||||||
|
|
||||||
|
const GROQ_API_KEY = process.env.GROQ_API_KEY || '';
|
||||||
|
const ISSUE_NUMBER = parseInt(process.env.ISSUE_NUMBER, 10);
|
||||||
|
|
||||||
|
/**
 * Load the prebuilt embedding index from disk.
 *
 * Exits the process (code 1) when the file does not exist, since the
 * detector cannot do anything useful without an index.
 *
 * @param {string} path - Path to the JSON index file.
 * @returns {{issues: object[], embeddings: number[][]}} Parsed index data.
 */
function loadIndex(path) {
  if (!existsSync(path)) {
    console.error(
      `Index file not found at ${path}. Run build-index.mjs first.`
    );
    process.exit(1);
  }

  const raw = readFileSync(path, 'utf-8');
  const index = JSON.parse(raw);
  console.log(`Loaded index with ${index.issues.length} issues`);
  return index;
}
|
||||||
|
|
||||||
|
/**
 * Rank indexed issues by similarity to the query embedding.
 *
 * @param {number[]} queryEmbedding - Normalized embedding of the new issue.
 * @param {{issues: object[], embeddings: number[][]}} index - Loaded index.
 * @param {object} [options]
 * @param {number} [options.topK] - Maximum number of candidates to return.
 * @param {number} [options.threshold] - Minimum similarity score to keep.
 * @param {number} [options.excludeNumber] - Issue number to skip (the query itself).
 * @returns {object[]} Candidate issues, each augmented with a `score` field,
 *   highest score first.
 */
function findSimilar(
  queryEmbedding,
  index,
  { topK = TOP_K, threshold = SIMILARITY_THRESHOLD, excludeNumber } = {}
) {
  const { issues, embeddings } = index;
  if (!issues.length) return [];

  const candidates = [];
  for (let i = 0; i < issues.length; i++) {
    const score = dotProduct(queryEmbedding, embeddings[i]);
    if (score < threshold) continue;
    if (excludeNumber && issues[i].number === excludeNumber) continue;
    candidates.push({ ...issues[i], score });
  }

  candidates.sort((a, b) => b.score - a.score);
  return candidates.slice(0, topK);
}
|
||||||
|
|
||||||
|
const CONFIRM_SYSTEM_PROMPT = `You are a GitHub issue triage assistant. You will be given a NEW issue and one \
|
||||||
|
or more CANDIDATE issues that may be duplicates.
|
||||||
|
|
||||||
|
For each candidate, determine if the new issue is truly a duplicate (same root \
|
||||||
|
problem/request) or merely related (similar area but different issue).
|
||||||
|
|
||||||
|
Respond ONLY with a JSON array of objects, each with:
|
||||||
|
- "number": the candidate issue number
|
||||||
|
- "duplicate": true or false
|
||||||
|
- "reason": one-sentence explanation
|
||||||
|
|
||||||
|
Example:
|
||||||
|
[{"number": 123, "duplicate": true, "reason": "Both report the same crash when ..."}]`;
|
||||||
|
|
||||||
|
/**
 * Ask the Groq LLM which similarity candidates are true duplicates of the
 * new issue.
 *
 * Fail-open by design: when the API key is missing, the request fails, or
 * the response cannot be parsed, ALL candidates are returned unchanged so a
 * maintainer still sees them.
 *
 * @param {object} newIssue - Full issue object from the GitHub API.
 * @param {object[]} candidates - Scored candidates from findSimilar().
 * @returns {Promise<object[]>} Confirmed duplicates; each gains `llm_reason`.
 */
async function confirmWithLlm(newIssue, candidates) {
  if (!GROQ_API_KEY) {
    console.warn('GROQ_API_KEY not set — skipping LLM confirmation');
    return candidates;
  }

  // Candidate bodies are capped at 500 chars to keep the prompt small.
  const candidateText = candidates
    .map(
      (c) =>
        `### Candidate #${c.number} (similarity: ${c.score.toFixed(2)})\n` +
        `**Title:** ${c.title}\n` +
        `**State:** ${c.state}\n` +
        `**Body preview:** ${(c.body_preview || 'N/A').slice(0, 500)}`
    )
    .join('\n\n');

  const userPrompt =
    `## NEW ISSUE #${newIssue.number}\n` +
    `**Title:** ${newIssue.title}\n` +
    `**Body:**\n${(newIssue.body || 'No body').slice(0, 1500)}\n\n` +
    `---\n\n` +
    `## CANDIDATES\n${candidateText}`;

  try {
    const resp = await fetch(
      'https://api.groq.com/openai/v1/chat/completions',
      {
        method: 'POST',
        headers: {
          Authorization: `Bearer ${GROQ_API_KEY}`,
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({
          model: GROQ_MODEL,
          messages: [
            { role: 'system', content: CONFIRM_SYSTEM_PROMPT },
            { role: 'user', content: userPrompt },
          ],
          // Low temperature for deterministic-ish triage verdicts.
          temperature: 0.1,
          max_tokens: 1024,
        }),
        // Abort the request rather than hanging the CI job.
        signal: AbortSignal.timeout(30_000),
      }
    );

    if (!resp.ok) {
      const text = await resp.text();
      throw new Error(`Groq API error ${resp.status}: ${text}`);
    }

    let content = (await resp.json()).choices[0].message.content.trim();

    // Strip a markdown code fence if the model wrapped the JSON in one.
    if (content.startsWith('```')) {
      content = content
        .split('\n')
        .slice(1)
        .join('\n')
        .replace(/```\s*$/, '')
        .trim();
    }

    const verdicts = JSON.parse(content);
    if (!Array.isArray(verdicts)) {
      throw new Error('Invalid LLM response format - expected array');
    }

    // Keyed by issue number so candidates the model skipped are ruled out.
    const verdictMap = new Map(verdicts.map((v) => [v.number, v]));

    const confirmed = [];
    for (const c of candidates) {
      const verdict = verdictMap.get(c.number);
      if (verdict?.duplicate) {
        c.llm_reason = verdict.reason || '';
        confirmed.push(c);
      } else {
        const reason = verdict?.reason || 'not evaluated';
        console.log(`  #${c.number} ruled out by LLM: ${reason}`);
      }
    }

    return confirmed;
  } catch (err) {
    // Fail open: better to surface unconfirmed candidates than none at all.
    console.warn(
      `LLM confirmation failed: ${err.message} - falling back to all candidates`
    );
    return candidates;
  }
}
|
||||||
|
|
||||||
|
/**
 * Render the maintainer-facing duplicate-report comment in Markdown.
 * The trailing HTML comment encodes all candidate numbers so follow-up
 * automation can find this comment later.
 *
 * @param {object[]} candidates - Confirmed candidates (score, title, llm_reason).
 * @returns {string} The full comment body.
 */
function formatComment(candidates) {
  const header = [
    '**Possible duplicate detected**',
    '',
    'This issue may be a duplicate of the following (detected via semantic similarity + LLM review):',
    '',
  ];

  // Only the top few candidates are listed to keep the comment short.
  const entries = candidates
    .slice(0, MAX_COMMENT_CANDIDATES)
    .map((candidate) => {
      const confidence = `${(candidate.score * 100).toFixed(0)}%`;
      const bullet = `- #${candidate.number} (${confidence} match) — ${candidate.title}`;
      return candidate.llm_reason
        ? `${bullet}\n > *${candidate.llm_reason}*`
        : bullet;
    });

  const footer = [
    '',
    'A maintainer will review this. If this is **not** a duplicate, no action is needed.',
    '',
    `<!-- duplicate-bot: candidates=${candidates.map((c) => c.number).join(',')} -->`,
  ];

  return [...header, ...entries, ...footer].join('\n');
}
|
||||||
|
|
||||||
|
/**
 * Entry point: embed the newly opened issue, find similar issues in the
 * prebuilt index, confirm with the LLM, then comment and label the issue
 * if any confirmed duplicates remain.
 */
async function main() {
  if (!ISSUE_NUMBER) {
    console.error('ISSUE_NUMBER not set');
    process.exit(1);
  }

  console.log(`Processing issue #${ISSUE_NUMBER}`);
  const issue = await getIssue(ISSUE_NUMBER);

  // Spam guard: skip authors who opened many issues within the last hour.
  // NOTE(review): this relies on fetchIssues honoring the `creator` option —
  // confirm it forwards that filter to the API; otherwise this counts ALL
  // recently-updated issues, not just this author's.
  const oneHourAgo = new Date(Date.now() - 60 * 60 * 1000).toISOString();
  const recentIssues = await fetchIssues({
    creator: issue.user.login,
    since: oneHourAgo,
    state: 'all',
  });

  if (recentIssues.length > 10) {
    console.log(
      `User ${issue.user.login} created ${recentIssues.length} issues in the last hour - skipping to prevent spam`
    );
    return;
  }

  // NOTE(review): these cheap guards run after the spam-check API call;
  // consider moving them first to avoid unnecessary requests.
  if (issue.pull_request) {
    console.log('Skipping - this is a pull request');
    return;
  }

  if (issue.user.type === 'Bot') {
    console.log('Skipping - issue created by bot');
    return;
  }

  console.log(`Loading model: ${MODEL_NAME}`);
  const extractor = await pipeline('feature-extraction', MODEL_NAME, {
    dtype: 'fp32',
  });
  const index = loadIndex(INDEX_PATH);

  // Embed the issue the same way the index was built (mean pooling +
  // normalization) so a dot product behaves as cosine similarity.
  const text = issueText(issue.title, issue.body);
  const output = await extractor(text, { pooling: 'mean', normalize: true });
  const queryEmbedding = output.tolist()[0];

  let candidates = findSimilar(queryEmbedding, index, {
    topK: TOP_K,
    threshold: SIMILARITY_THRESHOLD,
    excludeNumber: issue.number,
  });

  if (!candidates.length) {
    console.log('No similar issues found above threshold - done');
    return;
  }

  console.log(`Found ${candidates.length} candidates above threshold:`);
  for (const c of candidates) {
    console.log(`  #${c.number} (${c.score.toFixed(3)}) - ${c.title}`);
  }

  console.log('Running LLM confirmation via Groq...');
  candidates = await confirmWithLlm(issue, candidates);

  if (!candidates.length) {
    console.log('LLM ruled out all candidates - done');
    return;
  }

  const comment = formatComment(candidates);
  await postComment(ISSUE_NUMBER, comment);
  await addLabel(ISSUE_NUMBER, LABEL_NAME);

  console.log('Done!');
}
|
||||||
|
|
||||||
|
// Run the detector; any unhandled failure aborts with a non-zero exit code
// (the workflow step uses continue-on-error, so the issue event still succeeds).
main().catch((error) => {
  console.error(error);
  process.exit(1);
});
|
||||||
1043
bin/duplicate-detector/package-lock.json
generated
Normal file
1043
bin/duplicate-detector/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
13
bin/duplicate-detector/package.json
Normal file
13
bin/duplicate-detector/package.json
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
{
|
||||||
|
"name": "duplicate-detector",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"private": true,
|
||||||
|
"type": "module",
|
||||||
|
"scripts": {
|
||||||
|
"build-index": "node build-index.mjs",
|
||||||
|
"detect": "node detect.mjs"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"@huggingface/transformers": "^3.8.1"
|
||||||
|
}
|
||||||
|
}
|
||||||
116
bin/duplicate-detector/utils.mjs
Normal file
116
bin/duplicate-detector/utils.mjs
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
const GITHUB_API = 'https://api.github.com';
|
||||||
|
const GITHUB_TOKEN = process.env.GITHUB_TOKEN;
|
||||||
|
const GITHUB_REPOSITORY = process.env.GITHUB_REPOSITORY;
|
||||||
|
|
||||||
|
/**
 * Common headers for authenticated GitHub REST API requests.
 * @returns {{Authorization: string, Accept: string}}
 */
function ghHeaders() {
  const headers = {
    Accept: 'application/vnd.github+json',
    Authorization: `token ${GITHUB_TOKEN}`,
  };
  return headers;
}
|
||||||
|
|
||||||
|
/**
 * Fetch issues from the repository via the GitHub REST API, paginating
 * until `maxIssues` is reached or results run out. Pull requests (which
 * the issues endpoint also returns) are filtered out.
 *
 * @param {object} [options]
 * @param {string} [options.state='open'] - 'open', 'closed', or 'all'.
 * @param {string} [options.since] - ISO timestamp; only issues updated after it.
 * @param {string} [options.creator] - Restrict to issues opened by this login.
 * @param {number} [options.maxIssues=5000] - Upper bound on returned issues.
 * @returns {Promise<object[]>} Issue objects from the API.
 * @throws {Error} When the GitHub API responds with a non-2xx status.
 */
export async function fetchIssues({
  state = 'open',
  since,
  creator,
  maxIssues = 5000,
} = {}) {
  const issues = [];
  let page = 1;
  const perPage = 100;

  while (issues.length < maxIssues) {
    const params = new URLSearchParams({
      state,
      per_page: String(perPage),
      page: String(page),
      sort: 'updated',
      direction: 'desc',
    });
    if (since) params.set('since', since);
    // Fix: forward `creator`. detect.mjs passes it for its per-author spam
    // check, but the option was previously dropped here, so the check was
    // counting ALL recently-updated issues instead of the author's.
    if (creator) params.set('creator', creator);

    const url = `${GITHUB_API}/repos/${GITHUB_REPOSITORY}/issues?${params}`;
    const resp = await fetch(url, { headers: ghHeaders() });

    if (!resp.ok) {
      throw new Error(`GitHub API error: ${resp.status} ${resp.statusText}`);
    }

    const batch = await resp.json();
    if (!batch.length) break;

    for (const item of batch) {
      // The /issues endpoint also returns PRs; keep only real issues.
      if (!item.pull_request) {
        issues.push(item);
      }
    }

    page++;
    // A short page means we have reached the last page of results.
    if (batch.length < perPage) break;
  }

  return issues.slice(0, maxIssues);
}
|
||||||
|
|
||||||
|
/**
 * Fetch a single issue by number from the GitHub REST API.
 *
 * @param {number} issueNumber - The issue to fetch.
 * @returns {Promise<object>} The issue object.
 * @throws {Error} When the API responds with a non-2xx status.
 */
export async function getIssue(issueNumber) {
  const url = `${GITHUB_API}/repos/${GITHUB_REPOSITORY}/issues/${issueNumber}`;
  const response = await fetch(url, { headers: ghHeaders() });

  if (!response.ok) {
    throw new Error(
      `GitHub API error: ${response.status} ${response.statusText}`
    );
  }

  return response.json();
}
|
||||||
|
|
||||||
|
/**
 * Post a comment on an issue.
 *
 * @param {number} issueNumber - Target issue.
 * @param {string} body - Markdown comment body.
 * @throws {Error} When the API responds with a non-2xx status.
 */
export async function postComment(issueNumber, body) {
  const url = `${GITHUB_API}/repos/${GITHUB_REPOSITORY}/issues/${issueNumber}/comments`;
  const payload = JSON.stringify({ body });
  const response = await fetch(url, {
    method: 'POST',
    headers: { ...ghHeaders(), 'Content-Type': 'application/json' },
    body: payload,
  });

  if (!response.ok) {
    throw new Error(
      `Failed to post comment: ${response.status} ${response.statusText}`
    );
  }

  console.log(`Posted comment on #${issueNumber}`);
}
|
||||||
|
|
||||||
|
/**
 * Add a label to an issue. A 404 (label does not exist in the repo) is
 * logged and tolerated rather than thrown, since label creation is a
 * manual, one-time setup step.
 *
 * @param {number} issueNumber - Target issue.
 * @param {string} label - Label name to add.
 * @throws {Error} On any non-2xx response other than 404.
 */
export async function addLabel(issueNumber, label) {
  const url = `${GITHUB_API}/repos/${GITHUB_REPOSITORY}/issues/${issueNumber}/labels`;
  const payload = JSON.stringify({ labels: [label] });
  const response = await fetch(url, {
    method: 'POST',
    headers: { ...ghHeaders(), 'Content-Type': 'application/json' },
    body: payload,
  });

  if (response.status === 404) {
    console.warn(
      `Label '${label}' does not exist - skipping. Create it manually.`
    );
    return;
  }

  if (!response.ok) {
    throw new Error(
      `Failed to add label: ${response.status} ${response.statusText}`
    );
  }

  console.log(`Added label '${label}' to #${issueNumber}`);
}
|
||||||
|
|
||||||
|
/**
 * Build the text used to embed an issue: the title, then the trimmed body
 * (capped at 2000 characters, with an ellipsis when truncated).
 *
 * @param {string} title - Issue title.
 * @param {string|null|undefined} body - Issue body; may be empty or missing.
 * @returns {string} Combined text, or just the title when the body is empty.
 */
export function issueText(title, body) {
  const trimmed = (body || '').trim();
  const capped =
    trimmed.length > 2000 ? `${trimmed.slice(0, 2000)}...` : trimmed;
  return capped ? `${title}\n\n${capped}` : title;
}
|
||||||
|
|
||||||
|
/**
 * Dot product of two equal-length numeric vectors. Because the embeddings
 * in this project are normalized, this value serves as cosine similarity.
 *
 * @param {number[]} a - First vector.
 * @param {number[]} b - Second vector (same length as `a`).
 * @returns {number} The dot product.
 */
export function dotProduct(a, b) {
  return a.reduce((acc, value, i) => acc + value * b[i], 0);
}
|
||||||
Reference in New Issue
Block a user