camie-tagger / model /tag_series_grouper.py

V1.5

29b445b verified about 1 month ago

7.49 kB

	#!/usr/bin/env python3
	"""
	Tag Series Grouper

	This script analyzes metadata.json to find character tags that belong to specific series
	and groups them by series for creating specialized tag mosaics.

	Example:
	- firefly_(honkai:_star_rail) -> belongs to "honkai: star rail" series
	- raiden_mei_(honkai_impact) -> belongs to "honkai impact" series

	Usage:
	python tag_series_grouper.py --metadata_path path/to/metadata.json --output_path output/series_groups.json
	"""

	import os
	import json
	import re
	import argparse
	from collections import defaultdict

	def extract_series_from_tag(tag):
	    """
	    Split a "character_(series)" tag into its character and series parts.

	    The series is the final parenthesised suffix of the tag; underscores in
	    the series name are replaced by spaces.

	    Example: "firefly_(honkai:_star_rail)" -> ("firefly", "honkai: star rail")

	    Args:
	        tag: Tag string to inspect

	    Returns: tuple of (character_name, series_name) or (tag, None) if no series found
	    """
	    # The parentheses are escaped so they match literally. (.+) is greedy, so
	    # when a tag carries several "_(...)" suffixes only the last one is taken
	    # as the series and the rest stays part of the character name.
	    pattern = r'(.+)_\(([^)]+)\)$'
	    match = re.match(pattern, tag)
	    if match is None:
	        return tag, None

	    character, series = match.groups()

	    # Clean up series name
	    return character, series.replace('_', ' ')

	def find_series_tags(metadata_path, small_series_threshold=20):
	    """
	    Process metadata.json to find and group tags by series

	    Tags are read from the values of 'idx_to_tag' if that key is present,
	    otherwise from the keys of 'tag_to_idx'; if neither key exists no tags
	    are processed. Each tag is classified with extract_series_from_tag().

	    Args:
	        metadata_path: Path to metadata.json file
	        small_series_threshold: A series with more than this many tags gets
	            its own mosaic config; every other series is pooled into the
	            "other_series" config.

	    Returns:
	        Dictionary with three keys:
	        - "stats": total tags, tags with/without a series, number of unique
	          series, and number of characters summed over all series
	        - "series": per-series tags, characters and their counts, inserted
	          in order of tag count (largest first)
	        - "mosaic_configs": one config per large series, then "other_series"
	          (only when at least one small series exists), then "main"

	    Raises:
	        FileNotFoundError: If metadata_path does not exist
	    """
	    if not os.path.exists(metadata_path):
	        raise FileNotFoundError(f"Metadata file not found: {metadata_path}")

	    # Load metadata
	    with open(metadata_path, 'r', encoding='utf-8') as f:
	        metadata = json.load(f)

	    # Get tags from whichever mapping is available
	    if 'idx_to_tag' in metadata:
	        tags = list(metadata['idx_to_tag'].values())
	    elif 'tag_to_idx' in metadata:
	        tags = list(metadata['tag_to_idx'].keys())
	    else:
	        tags = []

	    # Group tags and character names by the series they belong to
	    series_tags = defaultdict(list)
	    characters_by_series = defaultdict(set)
	    series_tag_total = 0

	    for tag in tags:
	        character, series = extract_series_from_tag(tag)
	        if not series:
	            continue
	        series_tags[series].append(tag)
	        characters_by_series[series].add(character)
	        series_tag_total += 1

	    # Sort series by number of tags (most first)
	    sorted_series = sorted(series_tags, key=lambda name: len(series_tags[name]), reverse=True)

	    # Create result dictionary
	    result = {
	        "stats": {
	            "total_tags": len(tags),
	            "series_tags": series_tag_total,
	            "regular_tags": len(tags) - series_tag_total,
	            "unique_series": len(series_tags),
	            "total_characters": sum(len(chars) for chars in characters_by_series.values()),
	        },
	        "series": {},
	        "mosaic_configs": [],
	    }

	    # Tags belonging to series too small for a dedicated mosaic
	    small_series_tags = 0

	    for series in sorted_series:
	        tag_count = len(series_tags[series])
	        characters = characters_by_series[series]

	        result["series"][series] = {
	            "tags": series_tags[series],
	            "tag_count": tag_count,
	            # Sorted so the output does not depend on set iteration order
	            "characters": sorted(characters),
	            "character_count": len(characters),
	        }

	        if tag_count > small_series_threshold:
	            # Large series: dedicated mosaic config
	            series_key = series.replace(" ", "_").replace(":", "").lower()
	            result["mosaic_configs"].append({
	                "name": series_key,
	                "title": f"{series.title()} Collection",
	                "total_tags": tag_count,
	                "width": 512,
	                "height": 512,
	                "pixel_size": None,  # Automatically calculate based on tag count
	            })
	        else:
	            # Every series that is not large is pooled, so a series sitting
	            # exactly on the threshold is not dropped from both groups.
	            small_series_tags += tag_count

	    # Add a catch-all mosaic config for "Other Series" that combines small series
	    if small_series_tags > 0:
	        result["mosaic_configs"].append({
	            "name": "other_series",
	            "title": "Other Series Collection",
	            "total_tags": small_series_tags,
	            "width": 512,
	            "height": 512,
	            "pixel_size": None,
	        })

	    # Add a mosaic config for the main collection
	    result["mosaic_configs"].append({
	        "name": "main",
	        "title": "Complete Collection",
	        "total_tags": len(tags),
	        "width": 1024,
	        "height": 1024,
	        "pixel_size": 4,
	    })

	    return result

	def save_series_data(series_data, output_path):
	    """
	    Save the series data to a JSON file and print a short summary

	    Args:
	        series_data: Dictionary to serialize; must contain
	            series_data['stats']['unique_series'] and
	            series_data['mosaic_configs'] for the summary lines
	        output_path: Destination file path; missing parent directories
	            are created
	    """
	    # Create the destination directory first so that a path such as
	    # "output/series_groups.json" works even when "output/" does not exist.
	    parent_dir = os.path.dirname(output_path)
	    if parent_dir:
	        os.makedirs(parent_dir, exist_ok=True)

	    with open(output_path, 'w', encoding='utf-8') as f:
	        json.dump(series_data, f, indent=2)

	    print(f"Series data saved to {output_path}")
	    print(f"Found {series_data['stats']['unique_series']} unique series")
	    print(f"Created {len(series_data['mosaic_configs'])} mosaic configurations")

	def find_and_print_top_series(metadata_path, output_path=None, top_n=10):
	    """
	    Build the series grouping, print a report, and optionally save it

	    Args:
	        metadata_path: Path to metadata.json, passed to find_series_tags()
	        output_path: If truthy, the grouping is written there with
	            save_series_data()
	        top_n: How many of the largest series to list in the report

	    Returns:
	        The dictionary produced by find_series_tags()
	    """
	    series_data = find_series_tags(metadata_path)
	    stats = series_data['stats']

	    # Overall statistics, one "label: value" line per stats entry
	    print("\nTag Series Analysis")
	    print("=================")
	    stat_lines = (
	        ("Total tags analyzed", 'total_tags'),
	        ("Tags with series identifier", 'series_tags'),
	        ("Regular tags", 'regular_tags'),
	        ("Unique series found", 'unique_series'),
	        ("Total characters across series", 'total_characters'),
	    )
	    for label, stat_key in stat_lines:
	        print(f"{label}: {stats[stat_key]}")

	    # Largest series first
	    print(f"\nTop {top_n} Series by Tag Count:")
	    print("=========================")
	    ranked = sorted(
	        series_data['series'].items(),
	        key=lambda entry: entry[1]['tag_count'],
	        reverse=True,
	    )
	    for rank, (name, info) in enumerate(ranked[:top_n], start=1):
	        print(f"{rank}. {name.title()}: {info['tag_count']} tags, {info['character_count']} characters")

	    # Persist the grouping only when a destination was given
	    if output_path:
	        save_series_data(series_data, output_path)

	    return series_data

	def main(argv=None):
	    """
	    Command-line entry point

	    Args:
	        argv: Argument list to parse; None means use sys.argv[1:]
	            (argparse default)

	    Returns:
	        0 on success, 1 if the analysis raised an exception
	    """
	    parser = argparse.ArgumentParser(description='Group tags by series from metadata.json')
	    parser.add_argument('--metadata_path', type=str, required=True, help='Path to metadata.json')
	    parser.add_argument('--output_path', type=str, default='series_groups.json', help='Output path for series groups')
	    parser.add_argument('--top_n', type=int, default=10, help='Number of top series to display')

	    args = parser.parse_args(argv)

	    try:
	        find_and_print_top_series(args.metadata_path, args.output_path, args.top_n)
	    except Exception as e:
	        # Top-level boundary: report the failure and signal it through the
	        # exit status instead of finishing as if the run had succeeded.
	        print(f"Error: {str(e)}")
	        import traceback
	        traceback.print_exc()
	        return 1

	    return 0


	if __name__ == "__main__":
	    raise SystemExit(main())