import pdfplumber
import re, json, os
from cerebras.cloud.sdk import Cerebras
def parse_pdf_to_text(file_path, context_file_path=None):
    """
    Parse a PDF file into a single line of cleaned plain text.

    Text is extracted page by page with pdfplumber. Every character that is
    not a word character, whitespace, or one of ``. , ! ? @ : -`` is removed
    (this drops bullet glyphs and similar symbols while keeping e-mail
    addresses and sentences readable), and every run of whitespace is
    collapsed to one space.

    Args:
        file_path (str): Path to the PDF file.
        context_file_path (str, optional): Path to a JSON context file. When
            given, the file is loaded and pretty-printed to stdout; it does
            not influence the returned text. Defaults to None.

    Returns:
        str or None: The cleaned text, or None if any step raised an
        exception (the error is printed, not re-raised).
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may yield None for a page without extractable
            # text (e.g. a scanned image) in some pdfplumber versions; treat
            # that as an empty page instead of failing the whole document.
            page_texts = [page.extract_text() or '' for page in pdf.pages]

        # Join with a newline so the last word of one page does not fuse with
        # the first word of the next; the newline becomes a space below.
        text = '\n'.join(page_texts)

        # Remove bulletpoints and special signs, but preserve characters like @ and .
        text = re.sub(r'[^\w\s\.,!?@:\-]', '', text)
        # Collapses newlines, tabs and repeated blanks into single spaces.
        text = re.sub(r'\s+', ' ', text).strip()

        if context_file_path:
            # JSON is UTF-8 by specification; do not depend on the platform
            # default encoding.
            with open(context_file_path, 'r', encoding='utf-8') as f:
                context_data = json.load(f)
            print("Context Data:")
            print(json.dumps(context_data, indent=4))

        return text
    except Exception as e:
        # Best-effort contract: report the problem and signal failure with
        # None rather than propagating the exception to the caller.
        print(f"Error parsing PDF: {e}")
        return None
def process_pdf(pdf_path):
    """
    Extract a resume PDF's text and summarize it into structured JSON.

    The PDF text is obtained from ``parse_pdf_to_text`` and sent to the
    Cerebras chat-completions API (model ``llama-3.1-8b``) together with a
    strict JSON schema describing the resume fields. The model's JSON reply
    is parsed, pretty-printed to stdout and returned.

    Args:
        pdf_path (str): Path to the resume PDF.

    Returns:
        dict or None: The parsed candidate data, or None when no text could
        be extracted from the PDF (a message is printed in that case).

    Errors raised by the API call or by ``json.loads`` are not caught here
    and propagate to the caller.
    """
    text = parse_pdf_to_text(pdf_path)
    if not text:
        # parse_pdf_to_text signals failure by returning None (it does not
        # raise), and an image-only PDF can yield an empty string. Sending
        # either to the model would produce a fabricated summary, so stop.
        print(f"An error occurred: no text could be extracted from {pdf_path}")
        return None

    client = Cerebras(api_key=os.environ.get("CEREBRAS_API_KEY"))

    resume_schema = {
        "type": "object",
        "properties": {
            "education": {"type": "string"},
            "skills": {"type": "string"},
            "languages": {"type": "string"},
            "job experience": {"type": "string"},
            "publications": {"type": "string"},
            "location": {"type": "string"},
            # NOTE(review): "integer" cannot represent a leading "+" or
            # leading zeros; "string" is probably the better type. Left
            # unchanged because it alters the returned data shape - confirm
            # with consumers before switching.
            "phone number": {"type": "integer"},
            "linkedin": {"type": "string"},
            "github": {"type": "string"},
            "google scholar": {"type": "string"},
        },
        "required": ["education", "skills", "job experience"],
        "additionalProperties": False,
    }

    completion = client.chat.completions.create(
        model="llama-3.1-8b",
        messages=[
            {
                "role": "system",
                "content": f"You are a resume summarizing agent. All information you need about the candidate is here: {text}",
            },
            {
                "role": "user",
                "content": "Following the given response format, summarize the relevant information about this candidate.",
            },
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "resume_schema",
                "strict": True,
                "schema": resume_schema,
            },
        },
    )

    # Parse the JSON response
    candidate_data = json.loads(completion.choices[0].message.content)
    print(json.dumps(candidate_data, indent=2))
    return candidate_data