source-engine/sfmobjects/sfmphonemeextractor.cpp

//========= Copyright Valve Corporation, All rights reserved. ============//
//
// Purpose:
//
//=============================================================================

#include "sfmobjects/SFMPhonemeExtractor.h"
#include "tier2/riff.h"
#include "PhonemeConverter.h"
#include "filesystem.h"
#include "tier1/utlbuffer.h"
#include "sentence.h"
#include "movieobjects/dmesound.h"
#include "movieobjects/dmeanimationset.h"
#include "movieobjects/dmebookmark.h"
#include "movieobjects/dmeclip.h"
#include "movieobjects/dmechannel.h"
#include "soundchars.h"
#include "tier2/p4helpers.h"
#include "tier2/soundutils.h"
#include "tier1/utldict.h"

#include <windows.h>  // WAVEFORMATEX, WAVEFORMAT and ADPCM WAVEFORMAT!!!
#include <mmreg.h>

// memdbgon must be the last include file in a .cpp file!!!
#include "tier0/memdbgon.h"


static const char *s_pAttributeValueNames[LOG_PREVIEW_FLEX_CHANNEL_COUNT] =
{
	"value",
	"balance",
	"multilevel"
};

static const char *s_pDefaultAttributeValueNames[LOG_PREVIEW_FLEX_CHANNEL_COUNT] =
{
	"defaultValue",
	"defaultBalance",
	"defaultMultilevel"
};


struct Extractor
{
	PE_APITYPE			apitype;
	CSysModule			*module;
	IPhonemeExtractor	*extractor;
};


//-----------------------------------------------------------------------------
// Implementations of the phoneme extractor
//-----------------------------------------------------------------------------
class CSFMPhonemeExtractor : public ISFMPhonemeExtractor
{
public:
	CSFMPhonemeExtractor();

	// Inherited from ISFMPhonemeExtractor
	virtual bool Init();
	virtual void Shutdown();
	virtual int GetAPICount();
	virtual void GetAPIInfo( int index, CUtlString* pPrintName, PE_APITYPE *pAPIType );
	virtual void Extract( const PE_APITYPE& apiType, ExtractDesc_t& info, bool bWritePhonemesToWavFiles );
	virtual void ReApply( ExtractDesc_t& info );
	virtual bool GetSentence( CDmeGameSound *gameSound, CSentence& sentence );

private:
	int FindExtractor( PE_APITYPE type );
	bool GetWaveFormat( const char *filename, CUtlBuffer* pFormat, int *pDataSize, CSentence& sentence, bool &bGotSentence );
	void LogPhonemes( int nItemIndex, ExtractDesc_t& info );
	void ClearInterstitialSpaces( CDmeChannelsClip *pChannelsClip, CUtlDict< LogPreview_t *, int >& controlLookup, ExtractDesc_t& info );

	void StampControlValueLogs( CDmePreset *preset, DmeTime_t tHeadPosition, float flIntensity, CUtlDict< LogPreview_t *, int > &controlLookup );
	void WriteCurrentValuesIntoLogLayers( DmeTime_t tHeadPosition, const CUtlDict< LogPreview_t *, int > &controlLookup );
	void WriteDefaultValuesIntoLogLayers( DmeTime_t tHeadPosition, const CUtlDict< LogPreview_t *, int > &controlLookup );
	void BuildPhonemeLogList( CUtlVector< LogPreview_t > &list, CUtlVector< CDmeLog * > &logs );
	CDmeChannelsClip* FindFacialChannelsClip( const CUtlVector< LogPreview_t > &list );
	void BuildPhonemeToPresetMapping( const CUtlVector< CBasePhonemeTag * > &stream, CDmeAnimationSet *pSet, CDmePresetGroup * pPresetGroup, CUtlDict< CDmePreset *, unsigned short > &phonemeToPresetDict );

	CUtlVector< Extractor >	m_Extractors;
	int m_nCurrentExtractor;
};


//-----------------------------------------------------------------------------
// Singleton
//-----------------------------------------------------------------------------
static CSFMPhonemeExtractor g_ExtractorSingleton;
ISFMPhonemeExtractor *sfm_phonemeextractor = &g_ExtractorSingleton;


//-----------------------------------------------------------------------------
// Constructor
//-----------------------------------------------------------------------------
CSFMPhonemeExtractor::CSFMPhonemeExtractor() : m_nCurrentExtractor( -1 )
{
}


//-----------------------------------------------------------------------------
// Init, shutdown
//-----------------------------------------------------------------------------
bool CSFMPhonemeExtractor::Init()
{
	// Enumerate modules under bin folder of exe
	FileFindHandle_t findHandle;
	const char *pFilename = g_pFullFileSystem->FindFirstEx( "phonemeextractors/*.dll", "EXECUTABLE_PATH", &findHandle );
	while( pFilename )
	{
		char fullpath[ 512 ];
		Q_snprintf( fullpath, sizeof( fullpath ), "phonemeextractors/%s", pFilename );

		// Msg( "Loading extractor from %s\n", fullpath );

		Extractor e;
		e.module = g_pFullFileSystem->LoadModule( fullpath );
		if ( !e.module )
		{
			pFilename = g_pFullFileSystem->FindNext( findHandle );
			continue;
		}

		CreateInterfaceFn factory = Sys_GetFactory( e.module );
		if ( !factory )
		{
			pFilename = g_pFullFileSystem->FindNext( findHandle );
			continue;
		}

		e.extractor = ( IPhonemeExtractor * )factory( VPHONEME_EXTRACTOR_INTERFACE, NULL );
		if ( !e.extractor )
		{
			Warning( "Unable to get IPhonemeExtractor interface version %s from %s\n", VPHONEME_EXTRACTOR_INTERFACE, fullpath );
			pFilename = g_pFullFileSystem->FindNext( findHandle );
			continue;
		}

		e.apitype = e.extractor->GetAPIType();

		m_Extractors.AddToTail( e );
		pFilename = g_pFullFileSystem->FindNext( findHandle );
	}

	g_pFullFileSystem->FindClose( findHandle );
	return true;
}

void CSFMPhonemeExtractor::Shutdown()
{
	int c = m_Extractors.Count();
	for ( int i = c - 1; i >= 0; i-- )
	{
		Extractor *e = &m_Extractors[ i ];
		g_pFullFileSystem->UnloadModule( e->module );
	}

	m_Extractors.RemoveAll();
}


//-----------------------------------------------------------------------------
// Finds an extractor of a particular type
//-----------------------------------------------------------------------------
int CSFMPhonemeExtractor::FindExtractor( PE_APITYPE type )
{
	for ( int i=0; i < m_Extractors.Count(); i++ )
	{
		if ( m_Extractors[i].apitype == type )
			return i;
	}
	return -1;
}


//-----------------------------------------------------------------------------
// Iterates over extractors
//-----------------------------------------------------------------------------
int CSFMPhonemeExtractor::GetAPICount()
{
	return m_Extractors.Count();
}

void CSFMPhonemeExtractor::GetAPIInfo( int index, CUtlString* pPrintName, PE_APITYPE *pAPIType )
{
	Assert( pPrintName );
	Assert( pAPIType );
	pPrintName->Set( m_Extractors[ index ].extractor->GetName() );
	*pAPIType = m_Extractors[ index ].apitype;
}

static void ParseSentence( CSentence& sentence, IterateRIFF &walk )
{
	CUtlBuffer buf( 0, 0, CUtlBuffer::TEXT_BUFFER );

	buf.EnsureCapacity( walk.ChunkSize() );
	walk.ChunkRead( buf.Base() );
	buf.SeekPut( CUtlBuffer::SEEK_HEAD, walk.ChunkSize() );

	sentence.InitFromDataChunk( buf.Base(), buf.TellPut() );
}

bool CSFMPhonemeExtractor::GetWaveFormat( const char *filename, CUtlBuffer *pBuf, int *pDataSize, CSentence& sentence, bool &bGotSentence )
{
	InFileRIFF riff( filename, *g_pFSIOReadBinary );
	Assert( riff.RIFFName() == RIFF_WAVE );

	// set up the iterator for the whole file (root RIFF is a chunk)
	IterateRIFF walk( riff, riff.RIFFSize() );

	bool gotFmt = false;
	bool gotData = false;
	bGotSentence = false;

	// Walk input chunks and copy to output
	while ( walk.ChunkAvailable() )
	{
		switch ( walk.ChunkName() )
		{
		case WAVE_FMT:
			{
				pBuf->SeekPut( CUtlBuffer::SEEK_HEAD, walk.ChunkSize() );
				walk.ChunkRead( pBuf->Base() );
				gotFmt = true;
			}
			break;
		case WAVE_DATA:
			{
				*pDataSize = walk.ChunkSize();
				gotData = true;
			}
			break;
		case WAVE_VALVEDATA:
			{
				bGotSentence = true;
				ParseSentence( sentence, walk );
			}
			break;
		default:
			break;
		}

		// Done
		if ( gotFmt && gotData && bGotSentence )
			return true;

		walk.ChunkNext();
	}
	return ( gotFmt && gotData );
}

bool CSFMPhonemeExtractor::GetSentence( CDmeGameSound *gameSound, CSentence& sentence )
{
	const char *filename = gameSound->m_SoundName.Get();
	Assert( filename && filename [ 0 ] );

	char soundname[ 512 ];
	// Note, calling PSkipSoundChars to remove any decorator characters used by the engine!!!
	Q_snprintf( soundname, sizeof( soundname ), "sound/%s", PSkipSoundChars( filename ) );
	Q_FixSlashes( soundname );

	char fullpath[ 512 ];
	g_pFullFileSystem->RelativePathToFullPath( soundname, "GAME", fullpath, sizeof( fullpath ) );

	// Get sound file metrics of interest
	CUtlBuffer buf;
	int nDataSize;
	bool bValidSentence = false;
	if ( !GetWaveFormat( soundname, &buf, &nDataSize, sentence, bValidSentence ) )
		return false;

	return bValidSentence;
}

static void BuildPhonemeStream( CSentence& in, CUtlVector< CBasePhonemeTag * >& list )
{
	for ( int i = 0; i < in.m_Words.Count(); ++i )
	{
		CWordTag *w = in.m_Words[ i ];
		if ( !w )
			continue;

		for ( int j = 0; j < w->m_Phonemes.Count(); ++j )
		{
			CPhonemeTag *ph = w->m_Phonemes[ j ];
			if ( !ph )
				continue;

			CBasePhonemeTag *newTag = new CBasePhonemeTag( *ph );
			list.AddToTail( newTag );
		}
	}

	if ( !in.m_Words.Count() && in.m_RunTimePhonemes.Count() )
	{
		for ( int i = 0 ; i < in.m_RunTimePhonemes.Count(); ++i )
		{
			CBasePhonemeTag *newTag = new CBasePhonemeTag( *in.m_RunTimePhonemes[ i ] );
			list.AddToTail( newTag );
		}
	}
}


//-----------------------------------------------------------------------------
// Purpose: Same the phoneme data into the sound files
//-----------------------------------------------------------------------------
static void StoreValveDataChunk( CSentence& sentence, IterateOutputRIFF& store )
{
	// Buffer and dump data
	CUtlBuffer buf( 0, 0, CUtlBuffer::TEXT_BUFFER );

	sentence.SaveToBuffer( buf );

	// Copy into store
	store.ChunkWriteData( buf.Base(), buf.TellPut() );
}

static bool SaveSentenceToWavFile( const char *pWavFile, CSentence& sentence )
{
	char pTempFile[ 512 ];

	Q_StripExtension( pWavFile, pTempFile, sizeof( pTempFile ) );
	Q_DefaultExtension( pTempFile, ".tmp", sizeof( pTempFile ) );

	if ( g_pFullFileSystem->FileExists( pTempFile, "GAME" ) )
	{
		g_pFullFileSystem->RemoveFile( pTempFile, "GAME" );
	}

	CP4AutoEditAddFile p4Checkout( pWavFile );
	if ( !g_pFullFileSystem->IsFileWritable( pWavFile ) )
	{
		Warning( "%s is not writable, can't save sentence data to file\n", pWavFile );
		return false;
	}

	// Rename original pWavFile to temp
	g_pFullFileSystem->RenameFile( pWavFile, pTempFile, "GAME" );

	// NOTE:  Put this in it's own scope so that the destructor for outfileRFF actually closes the file!!!!
	{
		// Read from Temp
		InFileRIFF riff( pTempFile, *g_pFSIOReadBinary );
		Assert( riff.RIFFName() == RIFF_WAVE );

		// set up the iterator for the whole file (root RIFF is a chunk)
		IterateRIFF walk( riff, riff.RIFFSize() );

		// And put data back into original pWavFile by name
		OutFileRIFF riffout( pWavFile, *g_pFSIOWriteBinary );

		IterateOutputRIFF store( riffout );

		bool bWordTrackWritten = false;

		// Walk input chunks and copy to output
		while ( walk.ChunkAvailable() )
		{
			store.ChunkStart( walk.ChunkName() );

			switch ( walk.ChunkName() )
			{
			case WAVE_VALVEDATA:
				{
					// Overwrite data
					StoreValveDataChunk( sentence, store );
					bWordTrackWritten = true;
				}
				break;
			default:
				store.CopyChunkData( walk );
				break;
			}

			store.ChunkFinish();

			walk.ChunkNext();
		}

		// If we didn't write it above, write it now
		if ( !bWordTrackWritten )
		{
			store.ChunkStart( WAVE_VALVEDATA );
			StoreValveDataChunk( sentence, store );
			store.ChunkFinish();
		}
	}

	// Remove temp file
	g_pFullFileSystem->RemoveFile( pTempFile, NULL );

	return true;
}


//-----------------------------------------------------------------------------
// Main entry point for phoneme extraction
//-----------------------------------------------------------------------------
void CSFMPhonemeExtractor::Extract( const PE_APITYPE& apiType, ExtractDesc_t& info, bool bWritePhonemesToWavFiles )
{
	if ( !info.m_pSet )
		return;

	int iExtractor = FindExtractor( apiType );
	if ( iExtractor == -1 )
		return;

	Extractor& extractor = m_Extractors[ iExtractor ];

	int nWorkItem;
	for ( nWorkItem = 0; nWorkItem < info.m_WorkList.Count(); ++nWorkItem )
	{
		CExtractInfo& workItem = info.m_WorkList[ nWorkItem ];

		workItem.m_flDuration = 0.0f;

		CSentence in;
		CSentence out;
		in.SetText( workItem.m_sHintText.String() );
		out.SetText( workItem.m_sHintText.String() );

		const char *pFileName = workItem.m_pSound->m_SoundName.Get();
		Assert( pFileName && pFileName [ 0 ] );

		char pSoundName[ 512 ];
		// Note, calling PSkipSoundChars to remove any decorator characters used by the engine!!!
		Q_snprintf( pSoundName, sizeof( pSoundName ), "sound/%s", PSkipSoundChars( pFileName ) );
		Q_FixSlashes( pSoundName );

		char pFullPath[ 512 ];
		g_pFullFileSystem->RelativePathToFullPath( pSoundName, "GAME", pFullPath, sizeof( pFullPath ) );

		// Get sound file metrics of interest
		CUtlBuffer buf;
		WAVEFORMATEX *format;
		int nDataSize;
		if ( !GetWaveFormat( pSoundName, &buf, &nDataSize, workItem.m_Sentence, workItem.m_bSentenceValid ) )
			continue;

		format = ( WAVEFORMATEX * )buf.Base();

		if ( !( format->wBitsPerSample > ( 1 << 3 ) ) )
		{
			// Have to warn and early-out here to avoid crashing with "integer divide by zero" below
			Warning( "Cannot extract phonemes from '%s', %u bits per sample.\n", pSoundName, format->wBitsPerSample );
			continue;
		}

		int nBitsPerSample = format->wBitsPerSample;
		float flSampleRate = (float)format->nSamplesPerSec;
		int nChannels = format->nChannels;
		int nSampleCount = nDataSize / ( nBitsPerSample >> 3 );

		float flTrueSampleSize = ( nBitsPerSample * nChannels ) >> 3;
		if ( format->wFormatTag == WAVE_FORMAT_ADPCM )
		{
			nBitsPerSample = 16;
			flTrueSampleSize = 0.5f;

			ADPCMWAVEFORMAT *pFormat = (ADPCMWAVEFORMAT *)buf.Base();
			int blockSize = ((pFormat->wSamplesPerBlock - 2) * pFormat->wfx.nChannels ) / 2;
			blockSize += 7 * pFormat->wfx.nChannels;

			int blockCount = nDataSize / blockSize;
			int blockRem = nDataSize % blockSize;

			// total samples in complete blocks
			nSampleCount = blockCount * pFormat->wSamplesPerBlock;

			// add remaining in a short block
			if ( blockRem )
			{
				nSampleCount += pFormat->wSamplesPerBlock - (((blockSize - blockRem) * 2) / nChannels);
			}
		}

		if ( flSampleRate > 0.0f )
		{
			workItem.m_flDuration = (float)nSampleCount / flSampleRate;
		}
		in.CreateEventWordDistribution( workItem.m_sHintText.String(), workItem.m_flDuration );
		if ( !workItem.m_bUseSentence || !workItem.m_bSentenceValid )
		{
			extractor.extractor->Extract( pFullPath,
				(int)( workItem.m_flDuration * flSampleRate * flTrueSampleSize ),
				Msg, in, out );

			// Tracker 57389:
			// Total hack to fix a bug where the Lipsinc extractor is messing up the # channels on 16 bit stereo waves
			if ( apiType == SPEECH_API_LIPSINC && nChannels == 2 && nBitsPerSample == 16 )
			{
				flTrueSampleSize *= 2.0f;
			}

			float bytespersecond = flSampleRate * flTrueSampleSize;

			int i;
			// Now convert byte offsets to times
			for ( i = 0; i < out.m_Words.Size(); i++ )
			{
				CWordTag *tag = out.m_Words[ i ];
				Assert( tag );
				if ( !tag )
					continue;

				tag->m_flStartTime = ( float )(tag->m_uiStartByte ) / bytespersecond;
				tag->m_flEndTime = ( float )(tag->m_uiEndByte ) / bytespersecond;

				for ( int j = 0; j < tag->m_Phonemes.Size(); j++ )
				{
					CPhonemeTag *ptag = tag->m_Phonemes[ j ];
					Assert( ptag );
					if ( !ptag )
						continue;

					ptag->SetStartTime( ( float )(ptag->m_uiStartByte ) / bytespersecond );
					ptag->SetEndTime( ( float )(ptag->m_uiEndByte ) / bytespersecond );
				}
			}

			if ( bWritePhonemesToWavFiles )
			{
				SaveSentenceToWavFile( pFullPath, out );
			}
		}
		else
		{
			Msg( "Using .wav file phonemes for (%s)\n", pSoundName );
			out = workItem.m_Sentence;
		}

		// Now create channel data
		workItem.ClearTags();
		BuildPhonemeStream( out, workItem.m_ApplyTags );
	}

	if ( info.m_bCreateBookmarks )
	{
		info.m_pSet->GetBookmarks().RemoveAll();
	}

	for ( nWorkItem = 0; nWorkItem < info.m_WorkList.Count(); ++nWorkItem )
	{
		LogPhonemes( nWorkItem, info );
	}
}


//-----------------------------------------------------------------------------
//
//-----------------------------------------------------------------------------
static bool UniquePhonemeLessFunc( CBasePhonemeTag * const & lhs, CBasePhonemeTag * const & rhs )
{
	return lhs->GetPhonemeCode() < rhs->GetPhonemeCode();
}

void CSFMPhonemeExtractor::BuildPhonemeToPresetMapping( const CUtlVector< CBasePhonemeTag * > &stream,
													   CDmeAnimationSet *pSet, CDmePresetGroup *pPresetGroup, CUtlDict< CDmePreset *, unsigned short > &phonemeToPresetDict )
{
	int i;
	CUtlRBTree< CBasePhonemeTag * > uniquePhonemes( 0, 0, UniquePhonemeLessFunc );
	for ( i = 0; i < stream.Count(); ++i )
	{
		CBasePhonemeTag *tag = stream[ i ];
		if ( uniquePhonemes.Find( tag ) == uniquePhonemes.InvalidIndex() )
		{
			uniquePhonemes.Insert( tag );
		}
	}

	for ( i = uniquePhonemes.FirstInorder(); i != uniquePhonemes.InvalidIndex(); i = uniquePhonemes.NextInorder( i ) )
	{
		CBasePhonemeTag *tag = uniquePhonemes[ i ];
		// Convert phoneme code to text
		char ph[ 32 ];
		Q_strncpy( ph, ConvertPhoneme( tag->GetPhonemeCode() ), sizeof( ph ) );

		char remappedph[ 32 ];
		// By default we search for a preset name p_xxx where xxx is the phoneme string
		Q_snprintf( remappedph, sizeof( remappedph ), "p_%s", ph );
		// Now find the preset in the animation set converter
		CDmePhonemeMapping *mapping = pSet->FindMapping( ph );
		if ( mapping )
		{
			Q_strncpy( remappedph, mapping->GetValueString( "preset" ), sizeof( remappedph ) );
		}

		// Now look up the preset, if it exists
		CDmePreset *preset = pPresetGroup->FindPreset( remappedph );
		if ( !preset )
		{
			Warning( "Animation set '%s' missing phoneme preset for '%s' -> '%s'\n",
				pSet->GetName(), ph, remappedph );
			continue;
		}

		// Add to dictionary if it's not already there
		if ( phonemeToPresetDict.Find( ph ) == phonemeToPresetDict.InvalidIndex() )
		{
			phonemeToPresetDict.Insert( ph, preset );
		}
	}
}


//-----------------------------------------------------------------------------
// Finds the channels clip which refers to facial control values
//-----------------------------------------------------------------------------
CDmeChannelsClip* CSFMPhonemeExtractor::FindFacialChannelsClip( const CUtlVector< LogPreview_t > &list )
{
	CDmeChannelsClip *pChannelsClip = NULL;

	int i;
	for ( i = list.Count() - 1; i >= 0; --i )
	{
		const LogPreview_t &lp = list[i];
		CDmeChannelsClip *check = FindAncestorReferencingElement< CDmeChannelsClip >( (CDmElement *)lp.m_hChannels[ 0 ].Get() );

		if ( !pChannelsClip && check )
		{
			pChannelsClip = check;
		}
		else
		{
			if ( pChannelsClip != check )
			{
				Warning( "Selected controls overlap multiple channels clips!!!\n" );
			}
		}
	}

	if ( !pChannelsClip )
	{
		Warning( "Unable to determine destination channels clip!!!\n" );
	}

	return pChannelsClip;
}


//-----------------------------------------------------------------------------
// Builds the list of logs which target facial control values
//-----------------------------------------------------------------------------
void CSFMPhonemeExtractor::BuildPhonemeLogList( CUtlVector< LogPreview_t > &list, CUtlVector< CDmeLog * > &logs )
{
	for ( int i = 0; i < list.Count(); ++i )
	{
		LogPreview_t& p = list[ i ];

		for ( int channel = 0; channel < LOG_PREVIEW_FLEX_CHANNEL_COUNT; ++channel )
		{
			CDmeChannel *ch = p.m_hChannels[ channel ];
			if ( !ch )
				continue;

			CDmeLog *log = p.m_hChannels[ channel ]->GetLog();
			if ( !log )
				continue;

			logs.AddToTail( log );
		}
	}
}


//-----------------------------------------------------------------------------
// Writes default values into all log layers targetting facial control values
//-----------------------------------------------------------------------------
void CSFMPhonemeExtractor::WriteDefaultValuesIntoLogLayers( DmeTime_t tHeadPosition, const CUtlDict< LogPreview_t *, int > &controlLookup )
{
	// Write a zero into all relevant log layers
	for ( int j = controlLookup.First(); j != controlLookup.InvalidIndex(); j = controlLookup.Next( j ) )
	{
		LogPreview_t* lp = controlLookup[ j ];

		CDmElement *pControl = lp->m_hControl;

		for ( int chIndex = 0; chIndex < LOG_PREVIEW_FLEX_CHANNEL_COUNT; ++chIndex )
		{
			CDmeChannel *pChannel = lp->m_hChannels[ chIndex ];
			if ( !pChannel )
				continue;

			// Now get the log for the channel
			CDmeFloatLog *pFloatLog = CastElement< CDmeFloatLog >( pChannel->GetLog() );
			if ( !pFloatLog )
				continue;

			CDmeFloatLogLayer *pLayer = pFloatLog->GetLayer( pFloatLog->GetTopmostLayer() );
			if ( !pLayer )
				continue;

			float flDefaultValue = pControl->GetValue< float >( s_pDefaultAttributeValueNames[chIndex] );
			pLayer->InsertKey( tHeadPosition, flDefaultValue );
		}
	}
}


//-----------------------------------------------------------------------------
// Creates a new log key based on the interpolated value at that time
//-----------------------------------------------------------------------------
void CSFMPhonemeExtractor::WriteCurrentValuesIntoLogLayers( DmeTime_t tHeadPosition, const CUtlDict< LogPreview_t *, int > &controlLookup )
{
	// Write a zero into all relevant log layers
	for ( int j = controlLookup.First(); j != controlLookup.InvalidIndex(); j = controlLookup.Next( j ) )
	{
		LogPreview_t* lp = controlLookup[ j ];

		for ( int chIndex = 0; chIndex < LOG_PREVIEW_FLEX_CHANNEL_COUNT; ++chIndex )
		{
			CDmeChannel *pChannel = lp->m_hChannels[ chIndex ];
			if ( !pChannel )
				continue;

			// Now get the log for the channel
			CDmeFloatLog *pFloatLog = CastElement< CDmeFloatLog >( pChannel->GetLog() );
			if ( !pFloatLog )
				continue;

			CDmeFloatLogLayer *pLayer = pFloatLog->GetLayer( pFloatLog->GetTopmostLayer() );
			if ( !pLayer )
				continue;

			float flCurrentValue = pLayer->GetValue( tHeadPosition );
			pLayer->InsertKey( tHeadPosition, flCurrentValue );
		}
	}
}


//-----------------------------------------------------------------------------
// Samples extracted phoneme data and stamps that values into control value logs
//-----------------------------------------------------------------------------
void CSFMPhonemeExtractor::StampControlValueLogs( CDmePreset *preset, DmeTime_t tHeadPosition, float flIntensity, CUtlDict< LogPreview_t *, int > &controlLookup )
{
	// Now walk the logs required by the preset
	const CDmrElementArray< CDmElement > &controlValues = preset->GetControlValues( );
	for ( int j = 0; j < controlValues.Count(); ++j )
	{
		// This control contains the preset value
		CDmElement *presetControl = controlValues[ j ];
		if ( !presetControl )
			continue;

		int visIndex = controlLookup.Find( presetControl->GetName() );
		if ( visIndex == controlLookup.InvalidIndex() )
			continue;

		LogPreview_t* lp = controlLookup[ visIndex ];

		for ( int chIndex = 0; chIndex < LOG_PREVIEW_FLEX_CHANNEL_COUNT; ++chIndex )
		{
			CDmeChannel *ch = lp->m_hChannels[ chIndex ];
			if ( !ch )
				continue;

			// Whereas this control contains the "default" value for the slider (since the presetControl won't have that value)
			CDmElement *defaultValueControl = lp->m_hControl.Get();
			if ( !defaultValueControl )
				continue;

			// Now get the log for the channel
			CDmeLog *log = ch->GetLog();
			if ( !log )
			{
				Assert( 0 );
				continue;
			}

			CDmeFloatLog *floatLog = CastElement< CDmeFloatLog >( log );
			if ( !floatLog )
				continue;

			CDmeFloatLogLayer *pLayer = floatLog->GetLayer( floatLog->GetTopmostLayer() );
			if ( !pLayer )
				continue;

			float flDefault = defaultValueControl->GetValue< float >( s_pDefaultAttributeValueNames[chIndex] );
			float flControlValue = presetControl->GetValue< float >( s_pAttributeValueNames[ chIndex ] );
			float flNewValue = flIntensity * ( flControlValue - flDefault );
			float flCurrent = pLayer->GetValue( tHeadPosition ) - flDefault;
			// Accumulate new value into topmost layer
			pLayer->InsertKey( tHeadPosition, flCurrent + flNewValue + flDefault );
		}
	}
}

void CSFMPhonemeExtractor::ClearInterstitialSpaces( CDmeChannelsClip *pChannelsClip, CUtlDict< LogPreview_t *, int >& controlLookup, ExtractDesc_t& info )
{
	Assert( info.m_pShot );
	Assert( pChannelsClip );

	if ( info.m_WorkList.Count() == 0 )
		return;

	// This is handled by the main layering code...
	if ( info.m_nExtractType == EXTRACT_WIPE_SOUNDS )
		return;

	// Now walk through all relevant logs
	CUtlVector< CDmeLog * > logs;
	BuildPhonemeLogList( info.m_ControlList, logs );

	DmeTime_t tMinTime( DMETIME_MAXTIME );
	DmeTime_t tMaxTime( DMETIME_MINTIME );

	int i;
	// Walk work items and figure out time bounds
	for ( i = 0; i < info.m_WorkList.Count(); ++i )
	{
		CExtractInfo &item = info.m_WorkList[ i ];

		CUtlVector< CDmeHandle< CDmeClip > > srcStack;
		CUtlVector< CDmeHandle< CDmeClip > > dstStack;

		// Convert original .wav start to animation set channels clip relative time
		item.m_pClip->BuildClipStack( &srcStack, info.m_pMovie, info.m_pShot );

		// NOTE: Time bounds measured in sound media time goes from 0 -> flWaveDuration
		DmeTime_t tSoundMediaStartTime = CDmeClip::FromChildMediaTime( srcStack, DMETIME_ZERO, false );
		DmeTime_t tSoundMediaEndTime   = CDmeClip::FromChildMediaTime( srcStack, DmeTime_t( item.m_flDuration ), false );

		// NOTE: Start and end time are measured in sound media time
		DmeTime_t tStartTime = item.m_pClip->GetStartInChildMediaTime();
		DmeTime_t tEndTime   = item.m_pClip->GetEndInChildMediaTime();

		// And convert back down into channels clip relative time
		pChannelsClip->BuildClipStack( &dstStack, info.m_pMovie, info.m_pShot );

		// Now convert back down to channels clip relative time
		DmeTime_t tChannelMediaStartTime = CDmeClip::ToChildMediaTime( dstStack, tSoundMediaStartTime, false );
		DmeTime_t tChannelMediaEndTime = CDmeClip::ToChildMediaTime( dstStack, tSoundMediaEndTime, false );

		// Find a scale + offset which transforms data in media space of the sound [namely, the phonemes]
		// into the media space of the channels [the logs that drive the facial animation]
		DmeTime_t tEndDuration = tChannelMediaEndTime - tChannelMediaStartTime;
		double flScale = ( item.m_flDuration != 0.0f ) ? tEndDuration.GetSeconds() / item.m_flDuration : 0.0f;
		DmeTime_t tOffset = tChannelMediaStartTime;

		DmeTime_t tChannelRelativeStartTime( tStartTime * flScale );
		tChannelRelativeStartTime += tOffset;
		DmeTime_t tChannelRelativeEndTime( tEndTime * flScale );
		tChannelRelativeEndTime += tOffset;

		if ( tChannelRelativeStartTime < tMinTime )
		{
			tMinTime = tChannelRelativeStartTime;
		}
		if ( tChannelRelativeEndTime > tMaxTime )
		{
			tMaxTime = tChannelRelativeEndTime;
		}
	}

	// Bloat by one quantum
	tMinTime -= DMETIME_MINDELTA;
	tMaxTime += DMETIME_MINDELTA;

	for ( i = 0; i < logs.Count(); ++i )
	{
		CDmeLog *log = logs[ i ];

		Assert( log->GetNumLayers() == 1 );
		CDmeLogLayer *layer = log->GetLayer( log->GetTopmostLayer() );

		if ( info.m_nExtractType == EXTRACT_WIPE_RANGE )
		{
			// Write default value keys into log
			// Write a default value at that time
			WriteDefaultValuesIntoLogLayers( tMinTime, controlLookup );

			// Write a default value at that time
			WriteDefaultValuesIntoLogLayers( tMaxTime, controlLookup );

			// Now discard all keys > tMinTime and < tMaxTime
			for ( int j = layer->GetKeyCount() - 1; j >= 0; --j )
			{
				DmeTime_t &t = layer->GetKeyTime( j );
				if ( t <= tMinTime )
					continue;
				if ( t >= tMaxTime )
					continue;

				layer->RemoveKey( j );
			}
		}
		else
		{
			Assert( info.m_nExtractType == EXTRACT_WIPE_CLIP );
			layer->ClearKeys();
		}
	}
}

void AddAnimSetBookmarkAtSoundMediaTime( const char *pName, DmeTime_t tStart, DmeTime_t tEnd, const CUtlVector< CDmeHandle< CDmeClip > > &srcStack, ExtractDesc_t& info )
{
	tStart = CDmeClip::FromChildMediaTime( srcStack, tStart, false );
	tEnd   = CDmeClip::FromChildMediaTime( srcStack, tEnd, false );

	tStart = info.m_pShot->ToChildMediaTime( tStart, false );
	tEnd   = info.m_pShot->ToChildMediaTime( tEnd, false );

	CDmeBookmark *pBookmark = CreateElement< CDmeBookmark >( pName );
	pBookmark->SetNote( pName );
	pBookmark->SetTime( tStart );
	pBookmark->SetDuration( tEnd - tStart );
	info.m_pSet->GetBookmarks().AddToTail( pBookmark );
}

//-----------------------------------------------------------------------------
// Main entry point for generating phoneme logs
//-----------------------------------------------------------------------------
void CSFMPhonemeExtractor::LogPhonemes( int nItemIndex,	ExtractDesc_t& info )
{
	CExtractInfo &item = info.m_WorkList[ nItemIndex ];

	// Validate input parameters
	Assert( info.m_pSet && item.m_pClip && item.m_pSound );
	if ( !info.m_pSet || !item.m_pClip || !item.m_pSound )
		return;

	CDmePresetGroup *pPresetGroup = info.m_pSet->FindPresetGroup( "phoneme" );
	if ( !pPresetGroup )
	{
		Warning( "Animation set '%s' missing preset group 'phoneme'\n", info.m_pSet->GetName() );
		return;
	}

	if ( !info.m_pSet->GetPhonemeMap().Count() )
	{
		info.m_pSet->RestoreDefaultPhonemeMap();
	}

	// Walk through phoneme stack and build list of unique presets
	CUtlDict< CDmePreset *, unsigned short > phonemeToPresetDict;
	BuildPhonemeToPresetMapping( item.m_ApplyTags, info.m_pSet, pPresetGroup, phonemeToPresetDict );

	CDmeChannelsClip *pChannelsClip = FindFacialChannelsClip( info.m_ControlList );
	if ( !pChannelsClip )
		return;

	// Build a fast lookup of the visible sliders
	int i;
	CUtlDict< LogPreview_t *, int > controlLookup;
	for ( i = 0; i < info.m_ControlList.Count(); ++i )
	{
		controlLookup.Insert( info.m_ControlList[ i ].m_hControl->GetName(), &info.m_ControlList[ i ] );
	}

	// Only need to do this on the first item and we have multiple .wavs selected
	if ( nItemIndex == 0 && info.m_WorkList.Count() > 1 )
	{
		ClearInterstitialSpaces( pChannelsClip, controlLookup, info );
	}

	// Set up time selection, put channels into record and stamp out keyframes

	// Convert original .wav start to animation set channels clip relative time
	CUtlVector< CDmeHandle< CDmeClip > > srcStack;
	item.m_pClip->BuildClipStack( &srcStack, info.m_pMovie, info.m_pShot );
	if ( srcStack.Count() == 0 )
	{
		item.m_pClip->BuildClipStack( &srcStack, info.m_pMovie, NULL );
		if ( srcStack.Count() == 0 )
		{
			Msg( "Couldn't build stack sound clip to current shot\n" );
			return;
		}
	}

	// NOTE: Time bounds measured in sound media time goes from 0 -> flWaveDuration
	DmeTime_t tSoundMediaStartTime = CDmeClip::FromChildMediaTime( srcStack, DMETIME_ZERO, false );
	DmeTime_t tSoundMediaEndTime   = CDmeClip::FromChildMediaTime( srcStack, DmeTime_t( item.m_flDuration ), false );

	// NOTE: Start and end time are measured in sound media time
	DmeTime_t tStartTime = item.m_pClip->GetStartInChildMediaTime();
	DmeTime_t tEndTime   = item.m_pClip->GetEndInChildMediaTime();

	// And convert back down into channels clip relative time
	CUtlVector< CDmeHandle< CDmeClip > > dstStack;
	pChannelsClip->BuildClipStack( &dstStack, info.m_pMovie, info.m_pShot );

	// Now convert back down to channels clip relative time
	DmeTime_t tChannelMediaStartTime = CDmeClip::ToChildMediaTime( dstStack, tSoundMediaStartTime, false );
	DmeTime_t tChannelMediaEndTime   = CDmeClip::ToChildMediaTime( dstStack, tSoundMediaEndTime, false );

	// Find a scale + offset which transforms data in media space of the sound [namely, the phonemes]
	// into the media space of the channels [the logs that drive the facial animation]
	DmeTime_t tEndDuration = tChannelMediaEndTime - tChannelMediaStartTime;
	double flScale = ( item.m_flDuration != 0.0f ) ? tEndDuration.GetSeconds() / item.m_flDuration : 0.0f;
	DmeTime_t tOffset = tChannelMediaStartTime;

	CUtlVector< CDmeLog * > logs;
	BuildPhonemeLogList( info.m_ControlList, logs );

	// Add new write layer to each recording log
	for ( i = 0; i < logs.Count(); ++i )
	{
		logs[ i ]->AddNewLayer();
	}

	// Iterate over the entire range of the sound
	double flStartSoundTime = max( 0, tStartTime.GetSeconds() );
	double flEndSoundTime = min( item.m_flDuration, tEndTime.GetSeconds() );

	// Stamp keys right before and after the sound so as to
	// not generate new values outside the import time range
	DmeTime_t tPrePhonemeTime( flStartSoundTime * flScale );
	tPrePhonemeTime += tOffset - DMETIME_MINDELTA;
	WriteCurrentValuesIntoLogLayers( tPrePhonemeTime, controlLookup );

	DmeTime_t tPostPhonemeTime( flEndSoundTime * flScale );
	tPostPhonemeTime += tOffset + DMETIME_MINDELTA;
	WriteCurrentValuesIntoLogLayers( tPostPhonemeTime, controlLookup );

	// add bookmarks
	if ( info.m_bCreateBookmarks )
	{
		AddAnimSetBookmarkAtSoundMediaTime( "start", tPrePhonemeTime, tPrePhonemeTime, srcStack, info );

		for ( i = 0; i < item.m_ApplyTags.Count() ; ++i )
		{
			CBasePhonemeTag *p = item.m_ApplyTags[ i ];
			const char *pPhonemeName = ConvertPhoneme( p->GetPhonemeCode() );
			DmeTime_t tStart = DmeTime_t( p->GetStartTime() );
			DmeTime_t tEnd   = DmeTime_t( p->GetEndTime() );
			AddAnimSetBookmarkAtSoundMediaTime( pPhonemeName, tStart, tEnd, srcStack, info );
		}

		AddAnimSetBookmarkAtSoundMediaTime( "end", tPostPhonemeTime, tPostPhonemeTime, srcStack, info );
	}

	if ( info.m_nFilterType == EXTRACT_FILTER_HOLD || info.m_nFilterType == EXTRACT_FILTER_LINEAR )
	{
		CDmePreset *pLastPreset = NULL;

		for ( i = 0; i < item.m_ApplyTags.Count() ; ++i )
		{
			CBasePhonemeTag *p = item.m_ApplyTags[ i ];

			DmeTime_t tStart = DmeTime_t( p->GetStartTime() );
			DmeTime_t tEnd   = DmeTime_t( p->GetEndTime() );

			int idx = phonemeToPresetDict.Find( ConvertPhoneme( p->GetPhonemeCode() ) );
			if ( idx == phonemeToPresetDict.InvalidIndex() )
				continue;

			CDmePreset *preset = phonemeToPresetDict[ idx ];
			if ( !preset )
				continue;

			DmeTime_t tKeyTime = tStart * flScale + tOffset;

			if ( info.m_nFilterType == EXTRACT_FILTER_HOLD )
			{
				// stamp value at end of phoneme (or default prior to first phoneme)
				// NOTE - this ignores phoneme length, but since all phonemes directly abut one another, this doesn't matter
				DmeTime_t tLastEnd = tKeyTime - DMETIME_MINDELTA;
				if ( tLastEnd > tPrePhonemeTime )
				{
					WriteDefaultValuesIntoLogLayers( tKeyTime - DMETIME_MINDELTA, controlLookup );
					if ( pLastPreset )
					{
						StampControlValueLogs( pLastPreset, tKeyTime - DMETIME_MINDELTA, 1.0f, controlLookup );
					}
				}
				pLastPreset = preset;
			}

			WriteDefaultValuesIntoLogLayers( tKeyTime, controlLookup );
			StampControlValueLogs( preset, tKeyTime, 1.0f, controlLookup );

			if ( info.m_nFilterType == EXTRACT_FILTER_HOLD && i == item.m_ApplyTags.Count() - 1 )
			{
				// stamp value at end of last phoneme
				tKeyTime = tEnd * flScale + tOffset;
				tKeyTime = min( tKeyTime, tPostPhonemeTime );
				WriteDefaultValuesIntoLogLayers( tKeyTime - DMETIME_MINDELTA, controlLookup );
				StampControlValueLogs( preset, tKeyTime - DMETIME_MINDELTA, 1.0f, controlLookup );

				// stamp default just after end of last phoneme to hold silence until tPostPhonemeTime
				WriteDefaultValuesIntoLogLayers( tKeyTime, controlLookup );
			}
		}
	}
	else
	{
		Assert( info.m_nFilterType == EXTRACT_FILTER_FIXED_WIDTH );

		double tStep = 1.0 / (double)clamp( info.m_flSampleRateHz, 1.0f, 1000.0f );

		float flFilter = max( info.m_flSampleFilterSize, 0.001f );
		float flOOFilter = 1.0f / flFilter;

		for ( double t = flStartSoundTime; t < flEndSoundTime; t += tStep )
		{
			DmeTime_t tPhonemeTime( t );

			// Determine the location of the sample in the channels clip
			DmeTime_t tKeyTime( t * flScale );
			tKeyTime += tOffset;

			// Write a default value at that time
			WriteDefaultValuesIntoLogLayers( tKeyTime, controlLookup );

			// Walk phonemes...
			for ( i = 0; i < item.m_ApplyTags.Count() ; ++i )
			{
				CBasePhonemeTag *p = item.m_ApplyTags[ i ];

				DmeTime_t tStart = DmeTime_t( p->GetStartTime() );
				DmeTime_t tEnd = DmeTime_t( p->GetEndTime() );

				bool bContinue = false;
				float flI = 0.0f;
				{
					DmeTime_t tFilter( flFilter );
					if ( tStart >= tPhonemeTime + tFilter || tEnd <= tPhonemeTime )
						bContinue = true;

					tStart = max( tStart, tPhonemeTime );
					tEnd = min( tEnd, tPhonemeTime + tFilter );

					flI = ( tEnd - tStart ).GetSeconds() * flOOFilter;
				}

				DmeTime_t dStart = tStart - tPhonemeTime;
				DmeTime_t dEnd = tEnd - tPhonemeTime;

				float t1 = dStart.GetSeconds() * flOOFilter;
				float t2 = dEnd.GetSeconds() * flOOFilter;

				Assert( bContinue == !( t1 < 1.0f && t2 > 0.0f ) );
				if ( !( t1 < 1.0f && t2 > 0.0f ) )
					continue;

				if ( t2 > 1 )
				{
					t2 = 1;
				}
				if ( t1 < 0 )
				{
					t1 = 0;
				}

				float flIntensity = ( t2 - t1 );
				Assert( fabs( flI - flIntensity ) < 0.000001f );

				int idx = phonemeToPresetDict.Find( ConvertPhoneme( p->GetPhonemeCode() ) );
				if ( idx == phonemeToPresetDict.InvalidIndex() )
					continue;

				CDmePreset *preset = phonemeToPresetDict[ idx ];
				if ( !preset )
					continue;

				StampControlValueLogs( preset, tKeyTime, flIntensity, controlLookup );
			}
		}
	}

	// Flatten write layers
	for ( i = 0; i < logs.Count(); ++i )
	{
		logs[ i ]->FlattenLayers( DMELOG_DEFAULT_THRESHHOLD, CDmeLog::FLATTEN_NODISCONTINUITY_FIXUP );
	}
}

void CSFMPhonemeExtractor::ReApply( ExtractDesc_t& info )
{
	if ( info.m_bCreateBookmarks )
	{
		info.m_pSet->GetBookmarks().RemoveAll();
	}

	for ( int nWorkItem = 0; nWorkItem < info.m_WorkList.Count(); ++nWorkItem )
	{
		LogPhonemes( nWorkItem, info );
	}
}