##extract metadata from archive
import tempfile
import os
import argparse
import sys
#from joblib import Parallel, delayed
import subprocess
from tqdm import tqdm
from pathlib import Path
import tarfile
import zipfile
import os
import pandas as pd # type: ignore
from IPython.display import clear_output # type: ignore
import tarfile
import os
import subprocess
import pandas as pd
import tempfile
from tqdm import tqdm
from IPython.display import clear_output
[docs]
def is_tar_archive(file):
    """
    Tell whether a path can be opened as a tar archive.
    The file is opened with tarfile in mode 'r', so plain and compressed
    tar files are accepted alike.
    Parameters:
    file (str): The path to the file.
    Returns:
    bool: True if tarfile can open the file, False if it raises ReadError.
    """
    try:
        archive = tarfile.open(file, 'r')
    except tarfile.ReadError:
        # tarfile could not recognise the content as any kind of tar file.
        return False
    else:
        # Opening succeeded; release the handle before reporting success.
        archive.close()
        return True
[docs]
def search_string_in_file(file, search_string):
    """
    Return the first line of a text file that contains a given string.
    Only files whose name ends with one of the known plain-text extensions
    are opened; any other file is skipped without being read.
    Parameters:
    file (str): The path to the file.
    search_string (str): The string to search for.
    Returns:
    str: The first line containing the string (with its line ending, if
    any). An empty string is returned when the file is skipped, cannot be
    read, or contains no matching line.
    """
    text_extensions = ('.txt', '.json', '.xml', '.log', '.rtf', '.csv', '.tsv')
    try:
        if not file.endswith(text_extensions):
            return ""
        with open(file, 'r') as handle:
            for line in handle:
                if search_string in line:
                    return line
    except Exception:
        # Best-effort lookup: unreadable, undecodable or otherwise unusable
        # input counts as "not found". Unlike a bare except, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return ""
    # No line matched: honour the documented contract instead of returning None.
    return ""
[docs]
def get_init():
    """
    Load the showinf settings from the file "init.ini".
    The file name is relative, so it is looked up in the current working
    directory. Both values come from the "showinf parameter" section; a
    missing file, section or option results in a KeyError.
    Returns:
    tuple: (showinfPath, showinfParameter) as read from the configuration.
    """
    import configparser

    parser = configparser.ConfigParser()
    parser.read("init.ini")
    section = parser['showinf parameter']
    return (section['showinfPath'], section['showinfParameter'])
[docs]
def save_to_xml(concatMetadata, outputfolder):
    """
    Write a DataFrame to "concat_extraction_results.xml" in the output folder.
    Rows are numbered 0..n-1 in the written file. The numbering is applied
    to a copy, so the DataFrame passed in keeps its own index.
    Parameters:
    concatMetadata (DataFrame): The DataFrame to save.
    outputfolder (str): The path to the output folder. It is created if it
    does not exist yet.
    """
    # Renumber on a copy instead of overwriting the caller's index.
    frame = concatMetadata.reset_index(drop=True)
    try:
        xml_text = frame.to_xml()
    except ImportError:
        # The default pandas XML writer needs the optional lxml package;
        # fall back to the standard-library based writer when it is missing.
        xml_text = frame.to_xml(parser="etree")
    os.makedirs(outputfolder, exist_ok=True)
    target = os.path.join(outputfolder, "concat_extraction_results.xml")
    # to_xml() declares encoding='utf-8' in the XML header, so the file has to
    # be written as UTF-8 regardless of the platform's default encoding.
    with open(target, "w", encoding="utf-8") as out:
        out.write(xml_text)
[docs]
def process_tar_gz(file_path, outputfolder, tmp=1):
    """
    Extract the regular files of a tar archive and collect metadata for each of them.
    The archive is opened in mode "r", so tarfile detects the compression
    itself (plain, gz, bz2, xz, ...). Directories, links and other special
    members are not extracted.
    Parameters:
    file_path (str): The path to the tar archive.
    outputfolder (str): The folder handed to extract_metadata for its output.
    tmp (int): When 0, the archive is unpacked into outputfolder and the
    unpacked files stay there. For any other value the files are unpacked
    into a temporary directory that is removed before returning. Default is 1.
    Returns:
    DataFrame: One row per processed file with the columns extractError,
    inputFile, outputFile, extension and outputFolder.
    """
    showinfPath, showinfParameter = get_init()
    resultCols = ['extractError', 'inputFile', 'outputFile', 'extension', 'outputFolder']
    results = []
    # Use the "data" extraction filter where this Python version offers it:
    # it refuses members that would land outside the target directory and
    # avoids the DeprecationWarning for unfiltered extraction.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    with tarfile.open(file_path, "r") as tar:
        # The temporary directory is always created; with tmp == 0 it simply
        # stays empty. Rebinding the name below does not affect its cleanup.
        with tempfile.TemporaryDirectory() as temp_dir:
            if tmp == 0:
                temp_dir = outputfolder
            for member in tar.getmembers():
                if member.isfile():
                    tar.extract(member, temp_dir, **extract_kwargs)
            # get_inputlist is defined elsewhere; presumably it returns the
            # list of file paths found below the directory - confirm.
            filelist = get_inputlist(temp_dir)
            total = len(filelist)
            # tqdm wraps the list itself so the progress bar knows the total.
            for i, file in enumerate(tqdm(filelist)):
                # ":.1%" scales the fraction by 100; previously the raw 0-1
                # fraction was printed followed by a percent sign.
                print(f"work on {i}/{total} files in the archive ({i / total:.1%})")
                # extract_metadata is defined elsewhere; each result is
                # expected to supply one value per column in resultCols.
                results.append(extract_metadata(file, outputfolder, showinfPath, showinfParameter))
                clear_output(wait=True)
    # Combine the per-file results into a single table.
    return pd.DataFrame(results, columns=resultCols)
# Here you can add your own code to process the file
# Call the function with the path to your tar.gz file
#process_tar_gz("/home/omero-import/tmp/extract_metadata_test/ab22_20190529_MRI.tar.gz")
"""
archive = "/home/short_test/short.tar.xz"
#files = get_inputlist('/home/ab22_20190529_MRI')
outputfolder = '/home/short_test/short_metadata'
concatMetadata = process_tar_gz(archive, outputfolder)
save_to_xml(concatMetadata, outputfolder)
"""