{"version":"1.0","workflow_uuid":"b8e5b8fc-5318-11f1-9bc6-00163e2b0d79","workflow_title":"Data Juicer — Data Processing Pipeline for Foundation Models","install_contract":{"version":"1.0","installReady":false,"title":"Data Juicer — Data Processing Pipeline for Foundation Models","summary":"Data Juicer is a data processing toolkit designed for building and curating training datasets for large language models and multimodal models. It provides over 100 composable operators for filtering, deduplication, and quality analysis of text, image, audio, and video data.","assetType":"Scripts","pageUrl":"https://tokrepo.com/en/workflows/asset-b8e5b8fc","sourceUrl":"https://github.com/datajuicer/data-juicer","intendedFor":[],"firstActions":[],"agentFirstSteps":[],"targetPaths":[],"verification":[],"startingPoints":[],"example":"","successOutcome":"","boundaries":[],"askUserIf":["the current workspace stack cannot be matched to a safe upstream template","the target path is not the project root, or an existing file should be merged instead of overwritten"]}}