# Author: Attila Simkó
# Commit 2db37b1 — "big upgrade"
import pandas as pd
import os
import zipfile
import os
import re
from uuid import uuid4
import pdfplumber
import numpy as np
from urllib.request import urlretrieve
import xml.etree.ElementTree as ET
import re
import fitz # PyMuPDF
def get_fitz_urls(pdf_path):
    """
    Extract hyperlink targets embedded as PDF link annotations.

    :param pdf_path: Path to the PDF file
    :return: List of URI strings, in page order
    """
    urls = []
    # Context manager guarantees the document handle is closed even if an
    # exception is raised (the original left the fitz.Document open).
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for link in page.get_links():
                # get_links() also reports internal/goto links; only
                # entries with a 'uri' key point outside the document.
                if 'uri' in link:
                    urls.append(link['uri'])
    return urls
# TEI XML namespace map for ElementTree queries (e.g. on GROBID output).
# NOTE(review): not referenced by any code visible in this file — presumably
# kept for the commented-out GROBID path in fetch_url; confirm before removing.
NAMESPACE = {'tei': 'http://www.tei-c.org/ns/1.0'}
def find_pattern_in_xml(root, pattern):
    """
    Search for a regex pattern in all text fields of an XML tree.

    Checks each element's text content and every attribute value, for the
    root element and all of its descendants.

    :param root: The root Element of the XML tree
    :param pattern: The regex pattern to search for
    :return: A list of matching strings, in document order
    """
    # Compile once up front; the original recompiled at every recursion level.
    regex = re.compile(pattern)
    matches = []
    # Element.iter() walks the subtree in document order (self first, then
    # children depth-first) — the same visit order as the original hand-rolled
    # recursion, but without risking Python's recursion limit on deep trees.
    for elem in root.iter():
        if elem.text:
            matches.extend(regex.findall(elem.text))
        for attr_value in elem.attrib.values():
            matches.extend(regex.findall(attr_value))
    return matches
def fetch_url(pdf_path):
    """
    Collect candidate git repository URLs from a PDF.

    Combines two extraction strategies:
      1. Regex search over text extracted with pdfplumber (catches URLs
         typeset as plain text, possibly wrapped across line breaks).
      2. PDF link annotations via PyMuPDF (catches clickable links).

    The combined result is de-duplicated and filtered down to URLs that
    contain both a path separator and the substring "git" (GitHub/GitLab
    and similar hosts).

    :param pdf_path: Path to the PDF file
    :return: Sorted list of unique git-related URL strings
    :raises ValueError: If pdf_path is None
    """
    if pdf_path is None:
        raise ValueError("Pdf has no path")
    # Matches an optional doi:/http(s) prefix, then a domain name, IPv4
    # address, or IPv6 address, then an optional port and optional path.
    link_pattern = "\\b((?:doi:)?(?:https?://)?(?:(?:www\\.)?(?:[\\da-z\\.-]+)\\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\\w\\.-]*)*/?)\\b"
    full_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                # Re-join tokens split across line breaks (hyphenated words,
                # wrapped URLs) before flattening newlines to spaces, so that
                # multi-line URLs survive as one string.
                full_text += text.replace("-\n", "-").replace("_\n", "_").replace("/\n", "/").replace("\n", " ") + " "
    # The pattern has a single capturing group, so findall returns the
    # full matched URL strings directly.
    urls = re.findall(link_pattern, full_text)
    urls.extend(get_fitz_urls(pdf_path))
    # De-duplicate and sort (same lexicographic order np.unique produced),
    # then keep only URLs that look like git repository links.
    urls = sorted(set(urls))
    urls = [s for s in urls if "/" in s and "git" in s]
    return urls