camie-tagger / model /tag_series_grouper.py
Camais03's picture
V1.5
29b445b verified
#!/usr/bin/env python3
"""
Tag Series Grouper
This script analyzes metadata.json to find character tags that belong to specific series
and groups them by series for creating specialized tag mosaics.
Example:
- firefly_(honkai:_star_rail) -> belongs to "honkai: star rail" series
- raiden_mei_(honkai_impact) -> belongs to "honkai impact" series
Usage:
python tag_series_grouper.py --metadata_path path/to/metadata.json --output_path output/series_groups.json
"""
import os
import json
import re
import argparse
from collections import defaultdict
def extract_series_from_tag(tag):
    """
    Split a ``character_(series)`` style tag into its two parts.

    The series is taken from the final parenthesised suffix of the tag, and
    underscores inside it are turned into spaces.

    Example: "firefly_(honkai:_star_rail)" -> ("firefly", "honkai: star rail")

    Args:
        tag: Tag string to inspect.

    Returns:
        Tuple ``(character_name, series_name)`` when the tag ends with a
        ``_(...)`` suffix, otherwise ``(tag, None)``.
    """
    # Greedy first group: only the LAST "_(...)" suffix is treated as the series.
    found = re.match(r'(.+)_\(([^)]+)\)$', tag)
    if found is None:
        return tag, None

    character, raw_series = found.groups()
    return character, raw_series.replace('_', ' ')
def find_series_tags(metadata_path, min_series_tags=20):
    """
    Process metadata.json to find and group tags by series.

    Tags are read from the ``idx_to_tag`` mapping (its values) when present,
    otherwise from the ``tag_to_idx`` mapping (its keys). When neither key is
    present the tag list is empty.

    Args:
        metadata_path: Path to metadata.json file.
        min_series_tags: A series gets its own mosaic config only when it has
            MORE than this many tags. Every other series is pooled into the
            "other_series" catch-all config.

    Returns:
        Dictionary with three keys:
        - "stats": tag/series/character counts.
        - "series": per-series tags and characters, ordered by tag count
          (largest first).
        - "mosaic_configs": one config per large series, an optional
          "other_series" config, and a final "main" config.

    Raises:
        FileNotFoundError: If ``metadata_path`` does not exist.
    """
    if not os.path.exists(metadata_path):
        raise FileNotFoundError(f"Metadata file not found: {metadata_path}")

    # Load metadata
    with open(metadata_path, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    # Get tags from whichever mapping is available
    if 'idx_to_tag' in metadata:
        tags = list(metadata['idx_to_tag'].values())
    elif 'tag_to_idx' in metadata:
        tags = list(metadata['tag_to_idx'].keys())
    else:
        tags = []

    # Group tags (and the character names extracted from them) by series
    series_tags = defaultdict(list)
    characters_by_series = defaultdict(set)
    regular_tag_count = 0
    for tag in tags:
        character, series = extract_series_from_tag(tag)
        if series:
            series_tags[series].append(tag)
            characters_by_series[series].add(character)
        else:
            regular_tag_count += 1

    # Sort series by number of tags (most first)
    sorted_series = sorted(series_tags, key=lambda name: len(series_tags[name]), reverse=True)

    result = {
        "stats": {
            "total_tags": len(tags),
            "series_tags": len(tags) - regular_tag_count,
            "regular_tags": regular_tag_count,
            "unique_series": len(series_tags),
            "total_characters": sum(len(chars) for chars in characters_by_series.values())
        },
        "series": {},
        "mosaic_configs": []
    }

    small_series_tags = 0
    for series in sorted_series:
        tag_count = len(series_tags[series])
        result["series"][series] = {
            "tags": series_tags[series],
            "tag_count": tag_count,
            # Sorted so the output is reproducible (set order varies between runs)
            "characters": sorted(characters_by_series[series]),
            "character_count": len(characters_by_series[series])
        }

        if tag_count > min_series_tags:
            # Large series: dedicated mosaic config
            series_key = series.replace(" ", "_").replace(":", "").lower()
            result["mosaic_configs"].append({
                "name": series_key,
                "title": f"{series.title()} Collection",
                "total_tags": tag_count,
                "width": 512,
                "height": 512,
                "pixel_size": None  # Automatically calculate based on tag count
            })
        else:
            # Exact complement of the test above, so a series sitting right
            # on the threshold is never dropped from both groups.
            small_series_tags += tag_count

    # Add a catch-all mosaic config for "Other Series" that combines small series
    if small_series_tags > 0:
        result["mosaic_configs"].append({
            "name": "other_series",
            "title": "Other Series Collection",
            "total_tags": small_series_tags,
            "width": 512,
            "height": 512,
            "pixel_size": None
        })

    # Add a mosaic config for the main collection
    result["mosaic_configs"].append({
        "name": "main",
        "title": "Complete Collection",
        "total_tags": len(tags),
        "width": 1024,
        "height": 1024,
        "pixel_size": 4
    })
    return result
def save_series_data(series_data, output_path):
    """
    Save the series data to a JSON file and print a short summary.

    Args:
        series_data: Dictionary to serialise. Must contain
            ``series_data['stats']['unique_series']`` and a
            ``series_data['mosaic_configs']`` sequence, which are read for the
            summary lines.
        output_path: Destination file path. Missing parent directories are
            created, so a path such as ``output/series_groups.json`` works
            even when ``output/`` does not exist yet.
    """
    # A bare filename has no directory component; nothing to create then.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(series_data, f, indent=2)

    print(f"Series data saved to {output_path}")
    print(f"Found {series_data['stats']['unique_series']} unique series")
    print(f"Created {len(series_data['mosaic_configs'])} mosaic configurations")
def find_and_print_top_series(metadata_path, output_path=None, top_n=10):
    """
    Analyse the metadata file, print a report, and optionally save the result.

    Args:
        metadata_path: Path to metadata.json, passed to ``find_series_tags``.
        output_path: When truthy, the series data is written there via
            ``save_series_data``.
        top_n: How many of the largest series to list in the report.

    Returns:
        The series data dictionary produced by ``find_series_tags``.
    """
    series_data = find_series_tags(metadata_path)
    stats = series_data['stats']

    # Overall statistics
    report_lines = (
        "\nTag Series Analysis",
        "=================",
        f"Total tags analyzed: {stats['total_tags']}",
        f"Tags with series identifier: {stats['series_tags']}",
        f"Regular tags: {stats['regular_tags']}",
        f"Unique series found: {stats['unique_series']}",
        f"Total characters across series: {stats['total_characters']}",
    )
    for line in report_lines:
        print(line)

    # Largest series first
    print(f"\nTop {top_n} Series by Tag Count:")
    print("=========================")
    ranked = sorted(
        series_data['series'].items(),
        key=lambda entry: entry[1]['tag_count'],
        reverse=True,
    )
    for rank, (name, info) in enumerate(ranked[:top_n], start=1):
        print(f"{rank}. {name.title()}: {info['tag_count']} tags, {info['character_count']} characters")

    # Persist only when a destination was given
    if output_path:
        save_series_data(series_data, output_path)
    return series_data
def main():
    """
    Command-line entry point: parse arguments and run the series analysis.

    Returns:
        0 on success, 1 when the analysis raised an exception (the error and
        its traceback are printed first). The value is meant to be used as the
        process exit status so callers such as shells or CI can detect failure.
    """
    parser = argparse.ArgumentParser(description='Group tags by series from metadata.json')
    parser.add_argument('--metadata_path', type=str, required=True, help='Path to metadata.json')
    parser.add_argument('--output_path', type=str, default='series_groups.json', help='Output path for series groups')
    parser.add_argument('--top_n', type=int, default=10, help='Number of top series to display')
    args = parser.parse_args()

    try:
        find_and_print_top_series(args.metadata_path, args.output_path, args.top_n)
    except Exception as e:
        # Top-level boundary: report the failure, then signal it via the exit code.
        print(f"Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return 1
    return 0


if __name__ == "__main__":
    # Propagate main()'s result as the process exit status.
    raise SystemExit(main())