"""Extract candidate git repository URLs from PDF files.

Two extraction passes are combined: a regex scan over the PDF's text
layer (via pdfplumber) and the PDF's embedded link annotations (via
PyMuPDF), after which results are de-duplicated and filtered.
"""

import os
import re
import xml.etree.ElementTree as ET
import zipfile
from urllib.request import urlretrieve
from uuid import uuid4

import fitz  # PyMuPDF
import numpy as np
import pandas as pd
import pdfplumber

# TEI namespace emitted by GROBID; kept for XML post-processing helpers.
NAMESPACE = {'tei': 'http://www.tei-c.org/ns/1.0'}

# Matches an optional "doi:" / "http(s)://" prefix followed by a domain
# name, an IPv4 address, or an IPv6 address, with an optional port and
# path.  Compiled once at import time so fetch_url() does not rebuild it
# on every call.
LINK_PATTERN = re.compile(
    "\\b((?:doi:)?(?:https?://)?(?:(?:www\\.)?(?:[\\da-z\\.-]+)\\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\\w\\.-]*)*/?)\\b"
)


def get_fitz_urls(pdf_path):
    """Return the URLs of all embedded link annotations in a PDF.

    :param pdf_path: Path to the PDF file.
    :return: List of URI strings in page order (duplicates possible).
    """
    urls = []
    # Use the document as a context manager so the file handle is closed
    # even if link extraction raises (the original leaked the handle).
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for link in page.get_links():
                # Only link dicts carrying a 'uri' key point at external URLs.
                if 'uri' in link:
                    urls.append(link['uri'])
    return urls


def find_pattern_in_xml(root, pattern):
    """
    Recursively search for a regex pattern in all text fields of an XML tree.

    NOTE(review): element ``.tail`` text is not searched — only ``.text``
    and attribute values; confirm that is intended for GROBID TEI output.

    :param root: The root Element of the XML tree
    :param pattern: The regex pattern to search for
    :return: A list of matching strings
    """
    matches = []
    regex = re.compile(pattern)

    # Check element text
    if root.text:
        matches.extend(regex.findall(root.text))

    # Check element attributes
    for attr_value in root.attrib.values():
        matches.extend(regex.findall(attr_value))

    # Recursively search in children
    for child in root:
        matches.extend(find_pattern_in_xml(child, pattern))

    return matches


def fetch_url(pdf_path):
    """Extract candidate git repository URLs from a PDF.

    URLs are collected from the extracted text layer and from embedded
    link annotations, de-duplicated, and filtered down to strings that
    contain both "/" and "git".

    :param pdf_path: Path to the PDF file.
    :raises ValueError: If ``pdf_path`` is None.
    :return: Sorted list of unique matching URL strings.
    """
    if pdf_path is None:
        raise ValueError("Pdf has no path")

    urls = []

    # Pass 1: regex over the text layer.  URLs broken across line ends
    # ("-\n", "_\n", "/\n") are re-joined before remaining newlines are
    # flattened to spaces, so hyphenated/wrapped links survive extraction.
    # Pages are joined once instead of repeated string concatenation.
    pages = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                pages.append(
                    text.replace("-\n", "-")
                    .replace("_\n", "_")
                    .replace("/\n", "/")
                    .replace("\n", " ")
                )
    full_text = " ".join(pages) + " "
    urls.extend(LINK_PATTERN.findall(full_text))

    # Pass 2: embedded link annotations.
    urls.extend(get_fitz_urls(pdf_path))

    # np.unique both de-duplicates and sorts; then keep only plausible
    # repository links (must contain a path separator and "git").
    urls = np.unique(urls)
    return [s for s in urls if "/" in s and "git" in s]