From d236be12cecedc4ca2845921e233f4fdac0a2b06 Mon Sep 17 00:00:00 2001 From: mibali Date: Mon, 4 May 2026 15:10:41 +0100 Subject: [PATCH 1/2] Clean malformed CV skills sections --- backend/server.js | 12 ++-- extension-ready/cv-export.js | 66 ++++++++++++++++- render-proxy/server.js | 12 ++-- shared/cv-tailor.js | 134 ++++++++++++++++++++++++++++++++++- tests/cv-export.test.js | 20 ++++++ tests/cv-tailor.test.js | 39 ++++++++++ 6 files changed, 271 insertions(+), 12 deletions(-) diff --git a/backend/server.js b/backend/server.js index 46eb6d3..ced4b51 100644 --- a/backend/server.js +++ b/backend/server.js @@ -358,11 +358,15 @@ app.post('/api/cv/tailor', async (req, res) => { max_tokens: 4000 }); - const tailoredCvText = tailor.ensureConfirmedSkillsIncluded( - tailor.removeTailoringMetaPhrases( - tailor.enforceTargetHeadline(result.answer, jdData.jobTitle), - jdData.company + const tailoredCvText = tailor.cleanSkillsSection( + tailor.ensureConfirmedSkillsIncluded( + tailor.removeTailoringMetaPhrases( + tailor.enforceTargetHeadline(result.answer, jdData.jobTitle), + jdData.company + ), + confirmedSkills ), + matchMap, confirmedSkills ); if (!tailoredCvText?.trim()) { diff --git a/extension-ready/cv-export.js b/extension-ready/cv-export.js index ea7537a..f6368b6 100644 --- a/extension-ready/cv-export.js +++ b/extension-ready/cv-export.js @@ -55,6 +55,11 @@ function isEntrySectionHeader(line) { .test(line.replace(/[:\-]\s*$/, '').trim()); } +function isSkillsSectionHeader(line) { + return /^(core\s+competenc(?:y|ies)|technical\s+skills?|skills|technologies|competencies|expertise)\s*[:\-]?$/i + .test(line.replace(/[:\-]\s*$/, '').trim()); +} + function isContactLine(line) { return /[\w.+-]+@[\w-]+\.\w+/.test(line) || /https?:\/\//i.test(line) @@ -128,6 +133,51 @@ function contactLink(label, url) { return `${esc(label)}`; } +function splitSkillLine(line) { + const text = String(line || '') + .replace(/\)\s*(?=[A-Z][A-Za-z/& ]{2,36}:)/g, ') ') + .replace(/([a-z)])(?=[A-Z][A-Za-z/& ]{2,36}:)/g, '$1, ') + .replace(/\s+/g, ' ') + .trim(); + + const labelled = [...text.matchAll(/(?:^|[.;,]\s*)([A-Z][A-Za-z/& ]{2,40}):\s*([\s\S]*?)(?=(?:[.;,]\s*[A-Z][A-Za-z/& ]{2,40}:)|$)/g)]; + if (labelled.length >= 2) { + return labelled.map(([, label, value]) => cleanSkillItem(`${label.trim()}: ${value.trim()}`)).filter(isUsefulSkillItem); + } + + return text + .split(/\s*(?:;|\n|•)\s*/) + .flatMap(part => part.split(/\s*,\s+(?=[A-Z][A-Za-z/& ]{2,40}:)/)) + .map(cleanSkillItem) + .filter(isUsefulSkillItem); +} + +function cleanSkillItem(item) { + return String(item || '') + .replace(/^[-•*●▪◦–—]\s*/, '') + .replace(/\.\s*Strong experience with version control systems,\s*particularly\s+Git/gi, ', Git') + .replace(/\b(?:strong|solid|excellent|deep)\s+(?:knowledge|understanding|experience)\s+of\s+/gi, '') + .replace(/\bproficiency\s+in\s+/gi, '') + .replace(/\bexpertise\s+in\s+/gi, '') + .replace(/\bfamiliarity\s+with\s+/gi, '') + .replace(/\bexperience\s+with\s+/gi, '') + .replace(/\s+/g, ' ') + .replace(/\s+([),.;:])/g, '$1') + .replace(/[.,;]\s*$/, '') + .trim(); +} + +function isUsefulSkillItem(item) { + const text = String(item || '').trim(); + if (!text || text.length < 2 || text.length > 140) return false; + if (/\b\d+\+?\s+years?\s+of\s+experience\b/i.test(text)) return false; + if (/\bat least\s+\d+\s+years?\b/i.test(text)) return false; + if (/:\s*\(?\d+\s*(?:year|yr|month)/i.test(text)) return false; + if (/\b(highly preferred|required|minimum qualifications?|related field)\b/i.test(text)) return false; + if (/\b(?:bachelor|master|degree|education:|advanced degrees?)\b/i.test(text)) return false; + return /[A-Za-z]/.test(text); +} + function formatCvToHtml(rawText) { // Strip trailing "Links:" section added by PDF/DOCX extractor — links are // already inline in the text; we don't want them duplicated at the bottom. @@ -144,6 +194,7 @@ function formatCvToHtml(rawText) { let inHeader = true; let beforeFirstSection = true; let inEntrySection = false; // true inside Experience / Education sections + let inSkillsSection = false; // Buffer for a potential company name — flushed once we know what follows: // dates → cv-entry-row with dates; short non-date → entry-row + cv-job-title; other → standalone @@ -214,6 +265,7 @@ function formatCvToHtml(rawText) { inHeader = false; beforeFirstSection = false; inEntrySection = isEntrySectionHeader(line); + inSkillsSection = isSkillsSectionHeader(line); const sectionText = line.replace(/[:\-]\s*$/, '').trim(); if (sectionText) html += `

${esc(sectionText)}

`; continue; @@ -248,12 +300,24 @@ function formatCvToHtml(rawText) { flushPendingCompany(null); afterEntryRow = false; if (!listOpen) { html += '